diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml index 6912d15abf3d6..944ce9b4fb1f6 100644 --- a/.github/FUNDING.yml +++ b/.github/FUNDING.yml @@ -1 +1,2 @@ custom: https://pandas.pydata.org/donate.html +tidelift: pypi/pandas diff --git a/.github/SECURITY.md b/.github/SECURITY.md new file mode 100644 index 0000000000000..f3b059a5d4f13 --- /dev/null +++ b/.github/SECURITY.md @@ -0,0 +1 @@ +To report a security vulnerability to pandas, please go to https://tidelift.com/security and see the instructions there. diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 32ffb3330564c..5cc22c638c9b1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,17 +1,21 @@ repos: - - repo: https://github.com/python/black - rev: stable - hooks: - - id: black - language_version: python3.7 - - repo: https://gitlab.com/pycqa/flake8 - rev: 3.7.7 - hooks: - - id: flake8 - language: python_venv - additional_dependencies: [flake8-comprehensions] - - repo: https://github.com/pre-commit/mirrors-isort - rev: v4.3.20 - hooks: - - id: isort - language: python_venv +- repo: https://github.com/python/black + rev: stable + hooks: + - id: black + language_version: python3.7 +- repo: https://gitlab.com/pycqa/flake8 + rev: 3.7.7 + hooks: + - id: flake8 + language: python_venv + additional_dependencies: [flake8-comprehensions] +- repo: https://github.com/pre-commit/mirrors-isort + rev: v4.3.20 + hooks: + - id: isort + language: python_venv +- repo: https://github.com/asottile/seed-isort-config + rev: v1.9.2 + hooks: + - id: seed-isort-config diff --git a/.travis.yml b/.travis.yml index 9be4291d10874..79fecc41bec0d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -21,7 +21,7 @@ env: git: # for cloning - depth: 2000 + depth: false matrix: fast_finish: true @@ -63,7 +63,7 @@ before_install: - pwd - uname -a - git --version - - git tag + - ./ci/check_git_tags.sh # Because travis runs on Google Cloud and has a /etc/boto.cfg, # it breaks moto import, see: # https://github.com/spulec/moto/issues/1771 diff --git a/README.md b/README.md index aeeea1464e1fd..3cde98d3145f2 100644 --- a/README.md +++ b/README.md @@ -233,3 +233,5 @@ You can also triage issues which may include reproducing bug reports, or asking Or maybe through using pandas you have an idea of your own or are looking for something in the documentation and thinking ‘this can be improved’...you can do something about it! Feel free to ask questions on the [mailing list](https://groups.google.com/forum/?fromgroups#!forum/pydata) or on [Gitter](https://gitter.im/pydata/pandas). + +As contributors and maintainers to this project, you are expected to abide by pandas' code of conduct. 
More information can be found at: [Contributor Code of Conduct](https://github.com/pandas-dev/pandas/blob/master/.github/CODE_OF_CONDUCT.md) diff --git a/asv_bench/benchmarks/attrs_caching.py b/asv_bench/benchmarks/attrs_caching.py index c43e5dfd729aa..501e27b9078ec 100644 --- a/asv_bench/benchmarks/attrs_caching.py +++ b/asv_bench/benchmarks/attrs_caching.py @@ -1,4 +1,5 @@ import numpy as np + from pandas import DataFrame try: @@ -32,4 +33,4 @@ def time_cache_readonly(self): self.obj.prop -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/binary_ops.py b/asv_bench/benchmarks/binary_ops.py index fd3324b78f1c3..58e0db67d6025 100644 --- a/asv_bench/benchmarks/binary_ops.py +++ b/asv_bench/benchmarks/binary_ops.py @@ -1,4 +1,5 @@ import numpy as np + from pandas import DataFrame, Series, date_range from pandas.core.algorithms import checked_add_with_arr @@ -155,4 +156,4 @@ def time_add_overflow_both_arg_nan(self): ) -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 8097118a79d20..559aa7050a640 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -1,7 +1,9 @@ +import warnings + import numpy as np + import pandas as pd import pandas.util.testing as tm -import warnings try: from pandas.api.types import union_categoricals @@ -280,4 +282,4 @@ def time_sort_values(self): self.index.sort_values(ascending=False) -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/ctors.py b/asv_bench/benchmarks/ctors.py index 654075292cdf6..ec3dd7a48a89f 100644 --- a/asv_bench/benchmarks/ctors.py +++ b/asv_bench/benchmarks/ctors.py @@ -1,6 +1,7 @@ import numpy as np + +from pandas import DatetimeIndex, Index, MultiIndex, Series, Timestamp import pandas.util.testing as tm -from pandas import Series, Index, DatetimeIndex, Timestamp, MultiIndex def no_change(arr): @@ -113,4 +114,4 @@ def time_multiindex_from_iterables(self): MultiIndex.from_product(self.iterables) -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/dtypes.py b/asv_bench/benchmarks/dtypes.py index 60800b1f9cae7..24cc1c6f9fa70 100644 --- a/asv_bench/benchmarks/dtypes.py +++ b/asv_bench/benchmarks/dtypes.py @@ -1,14 +1,14 @@ +import numpy as np + from pandas.api.types import pandas_dtype -import numpy as np from .pandas_vb_common import ( - numeric_dtypes, datetime_dtypes, - string_dtypes, extension_dtypes, + numeric_dtypes, + string_dtypes, ) - _numpy_dtypes = [ np.dtype(dtype) for dtype in (numeric_dtypes + datetime_dtypes + string_dtypes) ] @@ -40,4 +40,4 @@ def time_pandas_dtype_invalid(self, dtype): pass -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/eval.py b/asv_bench/benchmarks/eval.py index 84e94315cc28b..06a181875aaa8 100644 --- a/asv_bench/benchmarks/eval.py +++ b/asv_bench/benchmarks/eval.py @@ -1,4 +1,5 @@ import numpy as np + import pandas as pd try: @@ -62,4 +63,4 @@ def time_query_with_boolean_selection(self): self.df.query("(a >= @self.min_val) & (a <= @self.max_val)") -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: 
F401 isort:skip diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py index acfb26bcf5d7c..3944e0bc523d8 100644 --- a/asv_bench/benchmarks/frame_ctor.py +++ b/asv_bench/benchmarks/frame_ctor.py @@ -1,6 +1,7 @@ import numpy as np + +from pandas import DataFrame, MultiIndex, Series, Timestamp, date_range import pandas.util.testing as tm -from pandas import DataFrame, Series, MultiIndex, Timestamp, date_range try: from pandas.tseries.offsets import Nano, Hour @@ -104,4 +105,4 @@ def time_frame_from_lists(self): self.df = DataFrame(self.data) -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index e2f6764c76eef..05f98c66faa2b 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -1,5 +1,5 @@ -import warnings import string +import warnings import numpy as np @@ -609,4 +609,4 @@ def time_dataframe_describe(self): self.df.describe() -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py index 0d0b75561d057..d57492dd37268 100644 --- a/asv_bench/benchmarks/gil.py +++ b/asv_bench/benchmarks/gil.py @@ -1,7 +1,8 @@ import numpy as np -import pandas.util.testing as tm -from pandas import DataFrame, Series, read_csv, factorize, date_range + +from pandas import DataFrame, Series, date_range, factorize, read_csv from pandas.core.algorithms import take_1d +import pandas.util.testing as tm try: from pandas import ( @@ -36,7 +37,7 @@ def wrapper(fname): return wrapper -from .pandas_vb_common import BaseIO +from .pandas_vb_common import BaseIO # noqa: E402 isort:skip class ParallelGroupbyMethods: @@ -301,4 +302,4 @@ def time_loop(self, threads): self.loop() -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 39b07d4734399..d51c53e2264f1 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -15,7 +15,6 @@ ) import pandas.util.testing as tm - method_blacklist = { "object": { "median", @@ -626,4 +625,4 @@ def time_first(self): self.df_nans.groupby("key").transform("first") -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index 6541ddcb0397d..a94960d494707 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -1,14 +1,17 @@ +import gc + import numpy as np -import pandas.util.testing as tm + from pandas import ( - Series, - date_range, DatetimeIndex, - Index, - RangeIndex, Float64Index, + Index, IntervalIndex, + RangeIndex, + Series, + date_range, ) +import pandas.util.testing as tm class SetOperations: @@ -225,4 +228,21 @@ def time_intersection_both_duplicate(self, N): self.intv.intersection(self.intv2) -from .pandas_vb_common import setup # noqa: F401 +class GC: + params = [1, 2, 5] + + def create_use_drop(self): + idx = Index(list(range(1000 * 1000))) + idx._engine + + def peakmem_gc_instances(self, N): + try: + gc.disable() + + for _ in range(N): + self.create_use_drop() + finally: + gc.enable() + + +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git 
a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 84604b8196536..ac35139c1954a 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -1,22 +1,23 @@ import warnings import numpy as np -import pandas.util.testing as tm + from pandas import ( - Series, + CategoricalIndex, DataFrame, - MultiIndex, - Int64Index, - UInt64Index, Float64Index, - IntervalIndex, - CategoricalIndex, IndexSlice, + Int64Index, + IntervalIndex, + MultiIndex, + Series, + UInt64Index, concat, date_range, option_context, period_range, ) +import pandas.util.testing as tm class NumericSeriesIndexing: @@ -371,4 +372,4 @@ def time_chained_indexing(self, mode): df2["C"] = 1.0 -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/inference.py b/asv_bench/benchmarks/inference.py index 66ef4f2aec380..e85b3bd2c7687 100644 --- a/asv_bench/benchmarks/inference.py +++ b/asv_bench/benchmarks/inference.py @@ -1,8 +1,9 @@ import numpy as np -import pandas.util.testing as tm + from pandas import DataFrame, Series, to_numeric +import pandas.util.testing as tm -from .pandas_vb_common import numeric_dtypes, lib +from .pandas_vb_common import lib, numeric_dtypes class NumericInferOps: @@ -120,4 +121,4 @@ def time_convert(self, data): lib.maybe_convert_numeric(data, set(), coerce_numeric=False) -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 4525e504fc4dd..9b8599b0a1b64 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -1,10 +1,11 @@ +from io import StringIO import random import string import numpy as np + +from pandas import Categorical, DataFrame, date_range, read_csv, to_datetime import pandas.util.testing as tm -from pandas import DataFrame, Categorical, date_range, read_csv, to_datetime -from io import StringIO from ..pandas_vb_common import BaseIO @@ -406,4 +407,4 @@ def time_to_datetime_format_DD_MM_YYYY(self, cache_dates): to_datetime(df["date"], cache=cache_dates, format="%d-%m-%Y") -from ..pandas_vb_common import setup # noqa: F401 +from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py index 12e70f84e5203..9aa5cbd5b6f7c 100644 --- a/asv_bench/benchmarks/io/excel.py +++ b/asv_bench/benchmarks/io/excel.py @@ -1,6 +1,8 @@ from io import BytesIO + import numpy as np -from pandas import DataFrame, date_range, ExcelWriter, read_excel + +from pandas import DataFrame, ExcelWriter, date_range, read_excel import pandas.util.testing as tm @@ -35,4 +37,4 @@ def time_write_excel(self, engine): writer_write.save() -from ..pandas_vb_common import setup # noqa: F401 +from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/hdf.py b/asv_bench/benchmarks/io/hdf.py index 2874a7889156b..8ec04a2087f1b 100644 --- a/asv_bench/benchmarks/io/hdf.py +++ b/asv_bench/benchmarks/io/hdf.py @@ -1,5 +1,6 @@ import numpy as np -from pandas import DataFrame, date_range, HDFStore, read_hdf + +from pandas import DataFrame, HDFStore, date_range, read_hdf import pandas.util.testing as tm from ..pandas_vb_common import BaseIO @@ -127,4 +128,4 @@ def time_write_hdf(self, format): self.df.to_hdf(self.fname, "df", format=format) -from ..pandas_vb_common import setup # noqa: F401 +from ..pandas_vb_common import setup # noqa: 
F401 isort:skip diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py index fc07f2a484102..b249c92b53e93 100644 --- a/asv_bench/benchmarks/io/json.py +++ b/asv_bench/benchmarks/io/json.py @@ -1,6 +1,7 @@ import numpy as np + +from pandas import DataFrame, concat, date_range, read_json, timedelta_range import pandas.util.testing as tm -from pandas import DataFrame, date_range, timedelta_range, concat, read_json from ..pandas_vb_common import BaseIO @@ -214,4 +215,4 @@ def peakmem_float(self, frames): df.to_json() -from ..pandas_vb_common import setup # noqa: F401 +from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/msgpack.py b/asv_bench/benchmarks/io/msgpack.py index d97b4ae13f0bd..f5038602539ab 100644 --- a/asv_bench/benchmarks/io/msgpack.py +++ b/asv_bench/benchmarks/io/msgpack.py @@ -1,5 +1,7 @@ import warnings + import numpy as np + from pandas import DataFrame, date_range, read_msgpack import pandas.util.testing as tm @@ -27,4 +29,4 @@ def time_write_msgpack(self): self.df.to_msgpack(self.fname) -from ..pandas_vb_common import setup # noqa: F401 +from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/pickle.py b/asv_bench/benchmarks/io/pickle.py index 286ac767c02e7..647e9d27dec9d 100644 --- a/asv_bench/benchmarks/io/pickle.py +++ b/asv_bench/benchmarks/io/pickle.py @@ -1,4 +1,5 @@ import numpy as np + from pandas import DataFrame, date_range, read_pickle import pandas.util.testing as tm @@ -25,4 +26,4 @@ def time_write_pickle(self): self.df.to_pickle(self.fname) -from ..pandas_vb_common import setup # noqa: F401 +from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/sql.py b/asv_bench/benchmarks/io/sql.py index b80872b17a9e4..fe84c869717e3 100644 --- a/asv_bench/benchmarks/io/sql.py +++ b/asv_bench/benchmarks/io/sql.py @@ -1,10 +1,11 @@ import sqlite3 import numpy as np -import pandas.util.testing as tm -from pandas import DataFrame, date_range, read_sql_query, read_sql_table from sqlalchemy import create_engine +from pandas import DataFrame, date_range, read_sql_query, read_sql_table +import pandas.util.testing as tm + class SQL: @@ -141,4 +142,4 @@ def time_read_sql_table_column(self, dtype): read_sql_table(self.table_name, self.con, columns=[dtype]) -from ..pandas_vb_common import setup # noqa: F401 +from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/stata.py b/asv_bench/benchmarks/io/stata.py index b3ed71af47dc8..28829785d72e9 100644 --- a/asv_bench/benchmarks/io/stata.py +++ b/asv_bench/benchmarks/io/stata.py @@ -1,4 +1,5 @@ import numpy as np + from pandas import DataFrame, date_range, read_stata import pandas.util.testing as tm @@ -50,4 +51,4 @@ def setup(self, convert_dates): self.df.to_stata(self.fname, self.convert_dates) -from ..pandas_vb_common import setup # noqa: F401 +from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index 7c899e3dc6ac8..6aa82a43a4d6a 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -1,8 +1,9 @@ import string import numpy as np + +from pandas import DataFrame, MultiIndex, Series, concat, date_range, merge, merge_asof import pandas.util.testing as tm -from pandas import DataFrame, Series, MultiIndex, date_range, concat, merge, merge_asof try: from pandas import merge_ordered @@ -348,4 +349,4 @@ def 
time_series_align_left_monotonic(self): self.ts1.align(self.ts2, join="left") -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py index eda059a68e8a5..3f4fd7ad911c1 100644 --- a/asv_bench/benchmarks/multiindex_object.py +++ b/asv_bench/benchmarks/multiindex_object.py @@ -1,8 +1,9 @@ import string import numpy as np + +from pandas import DataFrame, MultiIndex, date_range import pandas.util.testing as tm -from pandas import date_range, MultiIndex, DataFrame class GetLoc: @@ -146,4 +147,4 @@ def time_categorical_level(self): self.df.set_index(["a", "b"]) -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/offset.py b/asv_bench/benchmarks/offset.py index 31c3b6fb6cb60..d822646e712ae 100644 --- a/asv_bench/benchmarks/offset.py +++ b/asv_bench/benchmarks/offset.py @@ -1,7 +1,8 @@ -import warnings from datetime import datetime +import warnings import numpy as np + import pandas as pd try: diff --git a/asv_bench/benchmarks/pandas_vb_common.py b/asv_bench/benchmarks/pandas_vb_common.py index fdc8207021c0f..1faf13329110d 100644 --- a/asv_bench/benchmarks/pandas_vb_common.py +++ b/asv_bench/benchmarks/pandas_vb_common.py @@ -1,7 +1,8 @@ -import os from importlib import import_module +import os import numpy as np + import pandas as pd # Compatibility import for lib diff --git a/asv_bench/benchmarks/period.py b/asv_bench/benchmarks/period.py index 2f8ae0650ab75..7303240a25f29 100644 --- a/asv_bench/benchmarks/period.py +++ b/asv_bench/benchmarks/period.py @@ -1,4 +1,5 @@ from pandas import DataFrame, Period, PeriodIndex, Series, date_range, period_range + from pandas.tseries.frequencies import to_offset diff --git a/asv_bench/benchmarks/plotting.py b/asv_bench/benchmarks/plotting.py index 4fb0876f05a0a..5c718516360ed 100644 --- a/asv_bench/benchmarks/plotting.py +++ b/asv_bench/benchmarks/plotting.py @@ -1,11 +1,12 @@ +import matplotlib import numpy as np -from pandas import DataFrame, Series, DatetimeIndex, date_range + +from pandas import DataFrame, DatetimeIndex, Series, date_range try: from pandas.plotting import andrews_curves except ImportError: from pandas.tools.plotting import andrews_curves -import matplotlib matplotlib.use("Agg") @@ -93,4 +94,4 @@ def time_plot_andrews_curves(self): andrews_curves(self.df, "Name") -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/reindex.py b/asv_bench/benchmarks/reindex.py index 8d4c9ebaf3e89..cd450f801c805 100644 --- a/asv_bench/benchmarks/reindex.py +++ b/asv_bench/benchmarks/reindex.py @@ -1,6 +1,8 @@ import numpy as np + +from pandas import DataFrame, Index, MultiIndex, Series, date_range, period_range import pandas.util.testing as tm -from pandas import DataFrame, Series, MultiIndex, Index, date_range, period_range + from .pandas_vb_common import lib @@ -159,4 +161,4 @@ def time_lib_fast_zip(self): lib.fast_zip(self.col_array_list) -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/replace.py b/asv_bench/benchmarks/replace.py index 6137e944e6b9e..2a115fb0b4fe3 100644 --- a/asv_bench/benchmarks/replace.py +++ b/asv_bench/benchmarks/replace.py @@ -1,4 +1,5 @@ import numpy as np + import pandas as pd @@ -36,6 +37,23 @@ def 
time_replace_series(self, inplace): self.s.replace(self.to_rep, inplace=inplace) +class ReplaceList: + # GH#28099 + + params = [(True, False)] + param_names = ["inplace"] + + def setup(self, inplace): + self.df = pd.DataFrame({"A": 0, "B": 0}, index=range(4 * 10 ** 7)) + + def time_replace_list(self, inplace): + self.df.replace([np.inf, -np.inf], np.nan, inplace=inplace) + + def time_replace_list_one_match(self, inplace): + # the 1 can be held in self._df.blocks[0], while the inf and -inf cant + self.df.replace([np.inf, -np.inf, 1], np.nan, inplace=inplace) + + class Convert: params = (["DataFrame", "Series"], ["Timestamp", "Timedelta"]) @@ -56,4 +74,4 @@ def time_replace(self, constructor, replace_data): self.data.replace(self.to_replace) -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py index cc373f413fb88..441f4b380656e 100644 --- a/asv_bench/benchmarks/reshape.py +++ b/asv_bench/benchmarks/reshape.py @@ -1,9 +1,10 @@ -import string from itertools import product +import string import numpy as np -from pandas import DataFrame, MultiIndex, date_range, melt, wide_to_long + import pandas as pd +from pandas import DataFrame, MultiIndex, date_range, melt, wide_to_long class Melt: @@ -262,4 +263,4 @@ def time_explode(self, n_rows, max_list_length): self.series.explode() -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index a70977fcf539f..3640513d31be2 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -1,6 +1,7 @@ -import pandas as pd import numpy as np +import pandas as pd + class Methods: @@ -121,4 +122,4 @@ def peakmem_fixed(self): self.roll.max() -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 6038a2ab4bd9f..a3f1d92545c3f 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -1,8 +1,9 @@ from datetime import datetime import numpy as np + +from pandas import NaT, Series, date_range import pandas.util.testing as tm -from pandas import Series, date_range, NaT class SeriesConstructor: @@ -275,4 +276,4 @@ def time_func(self, func, N, dtype): self.func() -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py index 19d08c086a508..ac78ca53679fd 100644 --- a/asv_bench/benchmarks/sparse.py +++ b/asv_bench/benchmarks/sparse.py @@ -136,4 +136,4 @@ def time_division(self, fill_value): self.arr1 / self.arr2 -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py index 620a6de0f5f34..6032bee41958e 100644 --- a/asv_bench/benchmarks/stat_ops.py +++ b/asv_bench/benchmarks/stat_ops.py @@ -1,6 +1,6 @@ import numpy as np -import pandas as pd +import pandas as pd ops = ["mean", "sum", "median", "std", "skew", "kurt", "mad", "prod", "sem", "var"] @@ -148,4 +148,4 @@ def time_cov_series(self, use_bottleneck): self.s.cov(self.s2) -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 
isort:skip diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 6be2fa92d9eac..f30b2482615bd 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -1,7 +1,8 @@ import warnings import numpy as np -from pandas import Series, DataFrame + +from pandas import DataFrame, Series import pandas.util.testing as tm diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index 1020b773f8acb..498774034d642 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -2,7 +2,9 @@ import dateutil import numpy as np -from pandas import to_datetime, date_range, Series, DataFrame, period_range + +from pandas import DataFrame, Series, date_range, period_range, to_datetime + from pandas.tseries.frequencies import infer_freq try: @@ -426,4 +428,4 @@ def time_dt_accessor_year(self, tz): self.series.dt.year -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/azure-pipelines.yml b/azure-pipelines.yml index cfd7f6546833d..263a87176a9c9 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -22,22 +22,17 @@ jobs: timeoutInMinutes: 90 steps: - script: | - # XXX next command should avoid redefining the path in every step, but - # made the process crash as it couldn't find deactivate - #echo '##vso[task.prependpath]$HOME/miniconda3/bin' + echo '##vso[task.prependpath]$(HOME)/miniconda3/bin' echo '##vso[task.setvariable variable=ENV_FILE]environment.yml' echo '##vso[task.setvariable variable=AZURE]true' displayName: 'Setting environment variables' # Do not require a conda environment - - script: | - export PATH=$HOME/miniconda3/bin:$PATH - ci/code_checks.sh patterns + - script: ci/code_checks.sh patterns displayName: 'Looking for unwanted patterns' condition: true - script: | - export PATH=$HOME/miniconda3/bin:$PATH sudo apt-get install -y libc6-dev-i386 ci/setup_env.sh displayName: 'Setup environment and build pandas' @@ -45,14 +40,12 @@ jobs: # Do not require pandas - script: | - export PATH=$HOME/miniconda3/bin:$PATH source activate pandas-dev ci/code_checks.sh lint displayName: 'Linting' condition: true - script: | - export PATH=$HOME/miniconda3/bin:$PATH source activate pandas-dev ci/code_checks.sh dependencies displayName: 'Dependencies consistency' @@ -60,42 +53,36 @@ jobs: # Require pandas - script: | - export PATH=$HOME/miniconda3/bin:$PATH source activate pandas-dev ci/code_checks.sh code displayName: 'Checks on imported code' condition: true - script: | - export PATH=$HOME/miniconda3/bin:$PATH source activate pandas-dev ci/code_checks.sh doctests displayName: 'Running doctests' condition: true - script: | - export PATH=$HOME/miniconda3/bin:$PATH source activate pandas-dev ci/code_checks.sh docstrings displayName: 'Docstring validation' condition: true - script: | - export PATH=$HOME/miniconda3/bin:$PATH source activate pandas-dev ci/code_checks.sh typing displayName: 'Typing validation' condition: true - script: | - export PATH=$HOME/miniconda3/bin:$PATH source activate pandas-dev pytest --capture=no --strict scripts - displayName: 'Testing docstring validaton script' + displayName: 'Testing docstring validation script' condition: true - script: | - export PATH=$HOME/miniconda3/bin:$PATH source activate pandas-dev cd asv_bench asv check -E existing @@ -124,16 +111,15 @@ jobs: steps: - script: | echo '##vso[task.setvariable variable=ENV_FILE]environment.yml' + echo 
'##vso[task.prependpath]$(HOME)/miniconda3/bin' displayName: 'Setting environment variables' - script: | - export PATH=$HOME/miniconda3/bin:$PATH sudo apt-get install -y libc6-dev-i386 ci/setup_env.sh displayName: 'Setup environment and build pandas' - script: | - export PATH=$HOME/miniconda3/bin:$PATH source activate pandas-dev # Next we should simply have `doc/make.py --warnings-are-errors`, everything else is required because the ipython directive doesn't fail the build on errors (https://github.com/ipython/ipython/issues/11547) doc/make.py --warnings-are-errors | tee sphinx.log ; SPHINX_RET=${PIPESTATUS[0]} diff --git a/ci/azure/posix.yml b/ci/azure/posix.yml index 39f862290e720..6093df46ffb60 100644 --- a/ci/azure/posix.yml +++ b/ci/azure/posix.yml @@ -56,17 +56,15 @@ jobs: steps: - script: | if [ "$(uname)" == "Linux" ]; then sudo apt-get install -y libc6-dev-i386 $EXTRA_APT; fi + echo '##vso[task.prependpath]$(HOME)/miniconda3/bin' echo "Creating Environment" ci/setup_env.sh displayName: 'Setup environment and build pandas' - script: | - export PATH=$HOME/miniconda3/bin:$PATH source activate pandas-dev ci/run_tests.sh displayName: 'Test' - - script: | - export PATH=$HOME/miniconda3/bin:$PATH - source activate pandas-dev && pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd + - script: source activate pandas-dev && pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd - task: PublishTestResults@2 inputs: testResultsFiles: 'test-data-*.xml' @@ -97,7 +95,6 @@ jobs: } displayName: 'Check for test failures' - script: | - export PATH=$HOME/miniconda3/bin:$PATH source activate pandas-dev python ci/print_skipped.py displayName: 'Print skipped tests' diff --git a/ci/azure/windows.yml b/ci/azure/windows.yml index 20cad1bb4af96..dfa82819b9826 100644 --- a/ci/azure/windows.yml +++ b/ci/azure/windows.yml @@ -17,7 +17,9 @@ jobs: CONDA_PY: "37" steps: - - powershell: Write-Host "##vso[task.prependpath]$env:CONDA\Scripts" + - powershell: | + Write-Host "##vso[task.prependpath]$env:CONDA\Scripts" + Write-Host "##vso[task.prependpath]$HOME/miniconda3/bin" displayName: 'Add conda to PATH' - script: conda update -q -n base conda displayName: Update conda @@ -52,7 +54,6 @@ jobs: } displayName: 'Check for test failures' - script: | - export PATH=$HOME/miniconda3/bin:$PATH source activate pandas-dev python ci/print_skipped.py displayName: 'Print skipped tests' diff --git a/ci/check_git_tags.sh b/ci/check_git_tags.sh new file mode 100755 index 0000000000000..9dbcd4f98683e --- /dev/null +++ b/ci/check_git_tags.sh @@ -0,0 +1,28 @@ +set -e + +if [[ ! 
$(git tag) ]]; then + echo "No git tags in clone, please sync your git tags with upstream using:" + echo " git fetch --tags upstream" + echo " git push --tags origin" + echo "" + echo "If the issue persists, the clone depth needs to be increased in .travis.yml" + exit 1 +fi + +# This will error if there are no tags and we omit --always +DESCRIPTION=$(git describe --long --tags) +echo "$DESCRIPTION" + +if [[ "$DESCRIPTION" == *"untagged"* ]]; then + echo "Unable to determine most recent tag, aborting build" + exit 1 +else + if [[ "$DESCRIPTION" != *"g"* ]]; then + # A good description will have the hash prefixed by g, a bad one will be + # just the hash + echo "Unable to determine most recent tag, aborting build" + exit 1 + else + echo "$(git tag)" + fi +fi diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 06d45e38bfcdb..333136ddfddd9 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -263,8 +263,8 @@ fi ### DOCSTRINGS ### if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then - MSG='Validate docstrings (GL03, GL04, GL05, GL06, GL07, GL09, SS04, SS05, PR03, PR04, PR05, PR10, EX04, RT01, RT04, RT05, SA05)' ; echo $MSG - $BASE_DIR/scripts/validate_docstrings.py --format=azure --errors=GL03,GL04,GL05,GL06,GL07,GL09,SS04,SS05,PR03,PR04,PR05,PR10,EX04,RT01,RT04,RT05,SA05 + MSG='Validate docstrings (GL03, GL04, GL05, GL06, GL07, GL09, GL10, SS04, SS05, PR03, PR04, PR05, PR10, EX04, RT01, RT04, RT05, SA05)' ; echo $MSG + $BASE_DIR/scripts/validate_docstrings.py --format=azure --errors=GL03,GL04,GL05,GL06,GL07,GL09,GL10,SS04,SS05,PR03,PR04,PR05,PR10,EX04,RT01,RT04,RT05,SA05 RET=$(($RET + $?)) ; echo $MSG "DONE" fi diff --git a/ci/deps/azure-36-locale.yaml b/ci/deps/azure-36-locale.yaml index 8f8273f57c3fe..6a77b5dbedc61 100644 --- a/ci/deps/azure-36-locale.yaml +++ b/ci/deps/azure-36-locale.yaml @@ -20,8 +20,8 @@ dependencies: - xlsxwriter=0.9.8 - xlwt=1.2.0 # universal - - pytest>=4.0.2,<5.0.0 - - pytest-xdist + - pytest>=5.0.0 + - pytest-xdist>=1.29.0 - pytest-mock - pytest-azurepipelines - hypothesis>=3.58.0 diff --git a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-37-locale.yaml index 05adbf0c924dc..26dcd213bbfa0 100644 --- a/ci/deps/azure-37-locale.yaml +++ b/ci/deps/azure-37-locale.yaml @@ -26,8 +26,8 @@ dependencies: - xlsxwriter - xlwt # universal - - pytest>=4.0.2 - - pytest-xdist + - pytest>=5.0.1 + - pytest-xdist>=1.29.0 - pytest-mock - pytest-azurepipelines - pip diff --git a/ci/deps/azure-37-numpydev.yaml b/ci/deps/azure-37-numpydev.yaml index 5cf897c98da10..65c92ec1dcf0d 100644 --- a/ci/deps/azure-37-numpydev.yaml +++ b/ci/deps/azure-37-numpydev.yaml @@ -6,7 +6,8 @@ dependencies: - pytz - Cython>=0.28.2 # universal - - pytest>=4.0.2 + # pytest < 5 until defaults has pytest-xdist>=1.29.0 + - pytest>=4.0.2,<5.0 - pytest-xdist - pytest-mock - hypothesis>=3.58.0 diff --git a/ci/deps/azure-macos-35.yaml b/ci/deps/azure-macos-35.yaml index 98859b596ab2a..39315b15a018b 100644 --- a/ci/deps/azure-macos-35.yaml +++ b/ci/deps/azure-macos-35.yaml @@ -22,11 +22,12 @@ dependencies: - xlrd - xlsxwriter - xlwt + - pip - pip: - pyreadstat # universal - - pytest==4.5.0 - - pytest-xdist + - pytest>=5.0.1 + - pytest-xdist>=1.29.0 - pytest-mock - hypothesis>=3.58.0 # https://github.com/pandas-dev/pandas/issues/27421 diff --git a/ci/deps/azure-windows-36.yaml b/ci/deps/azure-windows-36.yaml index b0f3f5389ac85..ff9264a36cb12 100644 --- a/ci/deps/azure-windows-36.yaml +++ b/ci/deps/azure-windows-36.yaml @@ -23,8 +23,8 @@ dependencies: - xlwt # universal - cython>=0.28.2 - - 
pytest>=4.0.2 - - pytest-xdist + - pytest>=5.0.1 + - pytest-xdist>=1.29.0 - pytest-mock - pytest-azurepipelines - hypothesis>=3.58.0 diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index 08208d1e2d59a..075234a937035 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -26,8 +26,8 @@ dependencies: - xlwt # universal - cython>=0.28.2 - - pytest>=4.0.2 - - pytest-xdist + - pytest>=5.0.0 + - pytest-xdist>=1.29.0 - pytest-mock - pytest-azurepipelines - hypothesis>=3.58.0 diff --git a/ci/deps/travis-36-cov.yaml b/ci/deps/travis-36-cov.yaml index a3f6d5b30f3e1..19002cbb8575e 100644 --- a/ci/deps/travis-36-cov.yaml +++ b/ci/deps/travis-36-cov.yaml @@ -39,8 +39,8 @@ dependencies: - xlsxwriter - xlwt # universal - - pytest - - pytest-xdist + - pytest>=5.0.1 + - pytest-xdist>=1.29.0 - pytest-cov - pytest-mock - hypothesis>=3.58.0 diff --git a/ci/deps/travis-36-slow.yaml b/ci/deps/travis-36-slow.yaml index 538a82f66e4c8..9564bf5bb3a9f 100644 --- a/ci/deps/travis-36-slow.yaml +++ b/ci/deps/travis-36-slow.yaml @@ -25,8 +25,8 @@ dependencies: - xlsxwriter - xlwt # universal - - pytest>=4.0.2,<5.0.0 - - pytest-xdist + - pytest>=5.0.0 + - pytest-xdist>=1.29.0 - pytest-mock - moto - hypothesis>=3.58.0 diff --git a/ci/deps/travis-37.yaml b/ci/deps/travis-37.yaml index c9a8c274fb144..9e08c41a3d9c0 100644 --- a/ci/deps/travis-37.yaml +++ b/ci/deps/travis-37.yaml @@ -13,8 +13,8 @@ dependencies: - pyarrow - pytz # universal - - pytest>=4.0.2 - - pytest-xdist + - pytest>=5.0.0 + - pytest-xdist>=1.29.0 - pytest-mock - hypothesis>=3.58.0 - s3fs diff --git a/ci/print_skipped.py b/ci/print_skipped.py index a44281044e11d..6bc1dcfcd320d 100755 --- a/ci/print_skipped.py +++ b/ci/print_skipped.py @@ -1,8 +1,8 @@ #!/usr/bin/env python +import math import os import sys -import math import xml.etree.ElementTree as et diff --git a/ci/run_tests.sh b/ci/run_tests.sh index ee46da9f52eab..27d3fcb4cf563 100755 --- a/ci/run_tests.sh +++ b/ci/run_tests.sh @@ -50,9 +50,10 @@ do # if no tests are found (the case of "single and slow"), pytest exits with code 5, and would make the script fail, if not for the below code sh -c "$PYTEST_CMD; ret=\$?; [ \$ret = 5 ] && exit 0 || exit \$ret" - if [[ "$COVERAGE" && $? == 0 ]]; then - echo "uploading coverage for $TYPE tests" - echo "bash <(curl -s https://codecov.io/bash) -Z -c -F $TYPE -f $COVERAGE_FNAME" - bash <(curl -s https://codecov.io/bash) -Z -c -F $TYPE -f $COVERAGE_FNAME - fi + # 2019-08-21 disabling because this is hitting HTTP 400 errors GH#27602 + # if [[ "$COVERAGE" && $? 
== 0 && "$TRAVIS_BRANCH" == "master" ]]; then + # echo "uploading coverage for $TYPE tests" + # echo "bash <(curl -s https://codecov.io/bash) -Z -c -F $TYPE -f $COVERAGE_FNAME" + # bash <(curl -s https://codecov.io/bash) -Z -c -F $TYPE -f $COVERAGE_FNAME + # fi done diff --git a/doc/logo/pandas_logo.py b/doc/logo/pandas_logo.py index 5a07b094e6ad3..89410e3847bef 100644 --- a/doc/logo/pandas_logo.py +++ b/doc/logo/pandas_logo.py @@ -1,7 +1,6 @@ # script to generate the pandas logo -from matplotlib import pyplot as plt -from matplotlib import rcParams +from matplotlib import pyplot as plt, rcParams import numpy as np rcParams["mathtext.fontset"] = "cm" diff --git a/doc/make.py b/doc/make.py index 48febef20fbe6..cbb1fa6a5324a 100755 --- a/doc/make.py +++ b/doc/make.py @@ -11,18 +11,18 @@ $ python make.py html $ python make.py latex """ +import argparse +import csv import importlib -import sys import os import shutil -import csv import subprocess -import argparse +import sys import webbrowser + import docutils import docutils.parsers.rst - DOC_PATH = os.path.dirname(os.path.abspath(__file__)) SOURCE_PATH = os.path.join(DOC_PATH, "source") BUILD_PATH = os.path.join(DOC_PATH, "build") diff --git a/doc/source/conf.py b/doc/source/conf.py index 3ebc5d8b6333b..1da1948e45268 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -10,15 +10,15 @@ # All configuration values have a default; values that are commented out # serve to show the default. -import sys -import os -import inspect import importlib +import inspect import logging +import os +import sys + import jinja2 -from sphinx.ext.autosummary import _import_by_name from numpydoc.docscrape import NumpyDocString - +from sphinx.ext.autosummary import _import_by_name logger = logging.getLogger(__name__) @@ -141,7 +141,7 @@ # built documents. # # The short X.Y version. -import pandas +import pandas # noqa: E402 isort:skip # version = '%s r%s' % (pandas.__version__, svn_version()) version = str(pandas.__version__) @@ -315,7 +315,6 @@ import numpy as np import pandas as pd - randn = np.random.randn np.random.seed(123456) np.set_printoptions(precision=4, suppress=True) pd.options.display.max_rows = 15 @@ -433,10 +432,14 @@ # Add custom Documenter to handle attributes/methods of an AccessorProperty # eg pandas.Series.str and pandas.Series.dt (see GH9322) -import sphinx -from sphinx.util import rpartition -from sphinx.ext.autodoc import Documenter, MethodDocumenter, AttributeDocumenter -from sphinx.ext.autosummary import Autosummary +import sphinx # noqa: E402 isort:skip +from sphinx.util import rpartition # noqa: E402 isort:skip +from sphinx.ext.autodoc import ( # noqa: E402 isort:skip + AttributeDocumenter, + Documenter, + MethodDocumenter, +) +from sphinx.ext.autosummary import Autosummary # noqa: E402 isort:skip class AccessorDocumenter(MethodDocumenter): diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index 80dc8b0d8782b..be6555b2ab936 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -133,22 +133,11 @@ Installing a C compiler Pandas uses C extensions (mostly written using Cython) to speed up certain operations. To install pandas from source, you need to compile these C extensions, which means you need a C compiler. This process depends on which -platform you're using. Follow the `CPython contributing guide -`_ for getting a -compiler installed. 
You don't need to do any of the ``./configure`` or ``make`` -steps; you only need to install the compiler. - -For Windows developers, when using Python 3.5 and later, it is sufficient to -install `Visual Studio 2017 `_ with the -**Python development workload** and the **Python native development tools** -option. Otherwise, the following links may be helpful. - -* https://blogs.msdn.microsoft.com/pythonengineering/2017/03/07/python-support-in-vs2017/ -* https://blogs.msdn.microsoft.com/pythonengineering/2016/04/11/unable-to-find-vcvarsall-bat/ -* https://github.com/conda/conda-recipes/wiki/Building-from-Source-on-Windows-32-bit-and-64-bit -* https://cowboyprogrammer.org/building-python-wheels-for-windows/ -* https://blog.ionelmc.ro/2014/12/21/compiling-python-extensions-on-windows/ -* https://support.enthought.com/hc/en-us/articles/204469260-Building-Python-extensions-with-Canopy +platform you're using. + +* Windows: https://devguide.python.org/setup/#windows-compiling +* Mac: https://devguide.python.org/setup/#macos +* Unix: https://devguide.python.org/setup/#unix-compiling Let us know if you have any difficulties by opening an issue or reaching out on `Gitter`_. @@ -710,6 +699,136 @@ You'll also need to See :ref:`contributing.warnings` for more. +.. _contributing.type_hints: + +Type Hints +---------- + +*pandas* strongly encourages the use of :pep:`484` style type hints. New development should contain type hints, and pull requests to annotate existing code are accepted as well! + +Style Guidelines +~~~~~~~~~~~~~~~~ + +Type imports should follow the ``from typing import ...`` convention. So rather than + +.. code-block:: python + + import typing + + primes = [] # type: typing.List[int] + +You should write + +.. code-block:: python + + from typing import List, Optional, Union + + primes = [] # type: List[int] + +``Optional`` should be used where applicable, so instead of + +.. code-block:: python + + maybe_primes = [] # type: List[Union[int, None]] + +You should write + +.. code-block:: python + + maybe_primes = [] # type: List[Optional[int]] + +In some cases, classes in the code base may define class variables that shadow builtins. This causes an issue as described in `Mypy 1775 `_. The defensive solution here is to create an unambiguous alias of the builtin and use that within your annotation. For example, if you come across a definition like + +.. code-block:: python + + class SomeClass1: + str = None + +The appropriate way to annotate this would be as follows + +.. code-block:: python + + str_type = str + + class SomeClass2: + str = None # type: str_type + +In some cases you may be tempted to use ``cast`` from the typing module when you know better than the analyzer. This occurs particularly when using custom inference functions. For example + +.. code-block:: python + + from typing import cast + + from pandas.core.dtypes.common import is_number + + def cannot_infer_bad(obj: Union[str, int, float]): + + if is_number(obj): + ... + else: # Reasonably only str objects would reach this but... + obj = cast(str, obj) # Mypy complains without this! + return obj.upper() + +The limitation here is that while a human can reasonably understand that ``is_number`` would catch the ``int`` and ``float`` types, mypy cannot make that same inference just yet (see `mypy #5206 `_). While the above works, the use of ``cast`` is **strongly discouraged**. Where applicable a refactor of the code to appease static analysis is preferable. + +.. 
code-block:: python + + def cannot_infer_good(obj: Union[str, int, float]): + + if isinstance(obj, str): + return obj.upper() + else: + ... + +With custom types and inference this is not always possible so exceptions are made, but every effort should be exhausted to avoid ``cast`` before going down such paths. + +Syntax Requirements +~~~~~~~~~~~~~~~~~~~ + +Because *pandas* still supports Python 3.5, :pep:`526` does not apply and variables **must** be annotated with type comments. Specifically, this is a valid annotation within pandas: + +.. code-block:: python + + primes = [] # type: List[int] + +Whereas this is **NOT** allowed: + +.. code-block:: python + + primes: List[int] = [] # not supported in Python 3.5! + +Note that function signatures can always be annotated per :pep:`3107`: + +.. code-block:: python + + def sum_of_primes(primes: List[int] = []) -> int: + ... + + +Pandas-specific Types +~~~~~~~~~~~~~~~~~~~~~ + +Commonly used types specific to *pandas* will appear in `pandas._typing `_ and you should use these where applicable. This module is private for now but ultimately this should be exposed to third party libraries who want to implement type checking against pandas. + +For example, quite a few functions in *pandas* accept a ``dtype`` argument. This can be expressed as a string like ``"object"``, a ``numpy.dtype`` like ``np.int64`` or even a pandas ``ExtensionDtype`` like ``pd.CategoricalDtype``. Rather than burden the user with having to constantly annotate all of those options, this can simply be imported and reused from the pandas._typing module + +.. code-block:: python + + from pandas._typing import Dtype + + def as_type(dtype: Dtype) -> ...: + ... + +This module will ultimately house types for repeatedly used concepts like "path-like", "array-like", "numeric", etc... and can also hold aliases for commonly appearing parameters like `axis`. Development of this module is active so be sure to refer to the source for the most up to date list of available types. + +Validating Type Hints +~~~~~~~~~~~~~~~~~~~~~ + +*pandas* uses `mypy `_ to statically analyze the code base and type hints. After making any change you can ensure your type hints are correct by running + +.. code-block:: shell + + mypy pandas .. _contributing.ci: diff --git a/doc/source/development/developer.rst b/doc/source/development/developer.rst index a283920ae4377..923ef005d5926 100644 --- a/doc/source/development/developer.rst +++ b/doc/source/development/developer.rst @@ -37,12 +37,19 @@ So that a ``pandas.DataFrame`` can be faithfully reconstructed, we store a .. code-block:: text - {'index_columns': ['__index_level_0__', '__index_level_1__', ...], + {'index_columns': [, , ...], 'column_indexes': [, , ..., ], 'columns': [, , ...], - 'pandas_version': $VERSION} + 'pandas_version': $VERSION, + 'creator': { + 'library': $LIBRARY, + 'version': $LIBRARY_VERSION + }} -Here, ````/```` and so forth are dictionaries containing the metadata +The "descriptor" values ```` in the ``'index_columns'`` field are +strings (referring to a column) or dictionaries with values as described below. + +The ````/```` and so forth are dictionaries containing the metadata for each column, *including the index columns*. This has JSON form: .. code-block:: text @@ -53,26 +60,37 @@ for each column, *including the index columns*. This has JSON form: 'numpy_type': numpy_type, 'metadata': metadata} -.. note:: +See below for the detailed specification for these. 
+ +Index Metadata Descriptors +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +``RangeIndex`` can be stored as metadata only, not requiring serialization. The +descriptor format for these as is follows: - Every index column is stored with a name matching the pattern - ``__index_level_\d+__`` and its corresponding column information is can be - found with the following code snippet. +.. code-block:: python - Following this naming convention isn't strictly necessary, but strongly - suggested for compatibility with Arrow. + index = pd.RangeIndex(0, 10, 2) + {'kind': 'range', + 'name': index.name, + 'start': index.start, + 'stop': index.stop, + 'step': index.step} - Here's an example of how the index metadata is structured in pyarrow: +Other index types must be serialized as data columns along with the other +DataFrame columns. The metadata for these is a string indicating the name of +the field in the data columns, for example ``'__index_level_0__'``. - .. code-block:: python +If an index has a non-None ``name`` attribute, and there is no other column +with a name matching that value, then the ``index.name`` value can be used as +the descriptor. Otherwise (for unnamed indexes and ones with names colliding +with other column names) a disambiguating name with pattern matching +``__index_level_\d+__`` should be used. In cases of named indexes as data +columns, ``name`` attribute is always stored in the column descriptors as +above. - # assuming there's at least 3 levels in the index - index_columns = metadata['index_columns'] # noqa: F821 - columns = metadata['columns'] # noqa: F821 - ith_index = 2 - assert index_columns[ith_index] == '__index_level_2__' - ith_index_info = columns[-len(index_columns):][ith_index] - ith_index_level_name = ith_index_info['name'] +Column Metadata +~~~~~~~~~~~~~~~ ``pandas_type`` is the logical type of the column, and is one of: @@ -161,4 +179,8 @@ As an example of fully-formed metadata: 'numpy_type': 'int64', 'metadata': None} ], - 'pandas_version': '0.20.0'} + 'pandas_version': '0.20.0', + 'creator': { + 'library': 'pyarrow', + 'version': '0.13.0' + }} diff --git a/doc/source/getting_started/10min.rst b/doc/source/getting_started/10min.rst index 9045e5b32c29f..41520795bde62 100644 --- a/doc/source/getting_started/10min.rst +++ b/doc/source/getting_started/10min.rst @@ -278,7 +278,7 @@ Using a single column's values to select data. .. ipython:: python - df[df.A > 0] + df[df['A'] > 0] Selecting values from a DataFrame where a boolean condition is met. diff --git a/doc/source/getting_started/basics.rst b/doc/source/getting_started/basics.rst index 3f6f56376861f..802ffadf2a81e 100644 --- a/doc/source/getting_started/basics.rst +++ b/doc/source/getting_started/basics.rst @@ -926,7 +926,7 @@ Single aggregations on a ``Series`` this will return a scalar value: .. ipython:: python - tsdf.A.agg('sum') + tsdf['A'].agg('sum') Aggregating with multiple functions @@ -950,13 +950,13 @@ On a ``Series``, multiple functions return a ``Series``, indexed by the function .. ipython:: python - tsdf.A.agg(['sum', 'mean']) + tsdf['A'].agg(['sum', 'mean']) Passing a ``lambda`` function will yield a ```` named row: .. 
ipython:: python - tsdf.A.agg(['sum', lambda x: x.mean()]) + tsdf['A'].agg(['sum', lambda x: x.mean()]) Passing a named function will yield that name for the row: @@ -965,7 +965,7 @@ Passing a named function will yield that name for the row: def mymean(x): return x.mean() - tsdf.A.agg(['sum', mymean]) + tsdf['A'].agg(['sum', mymean]) Aggregating with a dict +++++++++++++++++++++++ @@ -1065,7 +1065,7 @@ Passing a single function to ``.transform()`` with a ``Series`` will yield a sin .. ipython:: python - tsdf.A.transform(np.abs) + tsdf['A'].transform(np.abs) Transform with multiple functions @@ -1084,7 +1084,7 @@ resulting column names will be the transforming functions. .. ipython:: python - tsdf.A.transform([np.abs, lambda x: x + 1]) + tsdf['A'].transform([np.abs, lambda x: x + 1]) Transforming with a dict diff --git a/doc/source/getting_started/comparison/comparison_with_r.rst b/doc/source/getting_started/comparison/comparison_with_r.rst index 444e886bc951d..f67f46fc2b29b 100644 --- a/doc/source/getting_started/comparison/comparison_with_r.rst +++ b/doc/source/getting_started/comparison/comparison_with_r.rst @@ -81,7 +81,7 @@ R pandas =========================================== =========================================== ``select(df, col_one = col1)`` ``df.rename(columns={'col1': 'col_one'})['col_one']`` ``rename(df, col_one = col1)`` ``df.rename(columns={'col1': 'col_one'})`` -``mutate(df, c=a-b)`` ``df.assign(c=df.a-df.b)`` +``mutate(df, c=a-b)`` ``df.assign(c=df['a']-df['b'])`` =========================================== =========================================== @@ -258,8 +258,8 @@ index/slice as well as standard boolean indexing: df = pd.DataFrame({'a': np.random.randn(10), 'b': np.random.randn(10)}) df.query('a <= b') - df[df.a <= df.b] - df.loc[df.a <= df.b] + df[df['a'] <= df['b']] + df.loc[df['a'] <= df['b']] For more details and examples see :ref:`the query documentation `. @@ -284,7 +284,7 @@ In ``pandas`` the equivalent expression, using the df = pd.DataFrame({'a': np.random.randn(10), 'b': np.random.randn(10)}) df.eval('a + b') - df.a + df.b # same as the previous expression + df['a'] + df['b'] # same as the previous expression In certain cases :meth:`~pandas.DataFrame.eval` will be much faster than evaluation in pure Python. For more details and examples see :ref:`the eval diff --git a/doc/source/getting_started/comparison/comparison_with_sql.rst b/doc/source/getting_started/comparison/comparison_with_sql.rst index 366fdd546f58b..6a03c06de3699 100644 --- a/doc/source/getting_started/comparison/comparison_with_sql.rst +++ b/doc/source/getting_started/comparison/comparison_with_sql.rst @@ -49,6 +49,20 @@ With pandas, column selection is done by passing a list of column names to your Calling the DataFrame without the list of column names would display all columns (akin to SQL's ``*``). +In SQL, you can add a calculated column: + +.. code-block:: sql + + SELECT *, tip/total_bill as tip_rate + FROM tips + LIMIT 5; + +With pandas, you can use the :meth:`DataFrame.assign` method of a DataFrame to append a new column: + +.. ipython:: python + + tips.assign(tip_rate=tips['tip'] / tips['total_bill']).head(5) + WHERE ----- Filtering in SQL is done via a WHERE clause. diff --git a/doc/source/index.rst.template b/doc/source/index.rst.template index b57ce83cfc33c..f5669626aa2b3 100644 --- a/doc/source/index.rst.template +++ b/doc/source/index.rst.template @@ -39,7 +39,7 @@ See the :ref:`overview` for more detail about what's in the library. 
:hidden: {% endif %} {% if not single_doc %} - What's New in 0.25.0 + What's New in 1.0.0 install getting_started/index user_guide/index @@ -53,7 +53,7 @@ See the :ref:`overview` for more detail about what's in the library. whatsnew/index {% endif %} -* :doc:`whatsnew/v0.25.0` +* :doc:`whatsnew/v1.0.0` * :doc:`install` * :doc:`getting_started/index` diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst index 407aab4bb1f1b..4b1a99da7cd4c 100644 --- a/doc/source/reference/extensions.rst +++ b/doc/source/reference/extensions.rst @@ -34,7 +34,6 @@ objects. api.extensions.ExtensionArray._concat_same_type api.extensions.ExtensionArray._formatter - api.extensions.ExtensionArray._formatting_values api.extensions.ExtensionArray._from_factorized api.extensions.ExtensionArray._from_sequence api.extensions.ExtensionArray._from_sequence_of_strings @@ -45,6 +44,7 @@ objects. api.extensions.ExtensionArray.argsort api.extensions.ExtensionArray.astype api.extensions.ExtensionArray.copy + api.extensions.ExtensionArray.view api.extensions.ExtensionArray.dropna api.extensions.ExtensionArray.factorize api.extensions.ExtensionArray.fillna diff --git a/doc/source/reference/window.rst b/doc/source/reference/window.rst index 9e1374a3bd8e4..2f6addf607877 100644 --- a/doc/source/reference/window.rst +++ b/doc/source/reference/window.rst @@ -5,7 +5,6 @@ ====== Window ====== -.. currentmodule:: pandas.core.window Rolling objects are returned by ``.rolling`` calls: :func:`pandas.DataFrame.rolling`, :func:`pandas.Series.rolling`, etc. Expanding objects are returned by ``.expanding`` calls: :func:`pandas.DataFrame.expanding`, :func:`pandas.Series.expanding`, etc. @@ -13,6 +12,8 @@ EWM objects are returned by ``.ewm`` calls: :func:`pandas.DataFrame.ewm`, :func: Standard moving window functions -------------------------------- +.. currentmodule:: pandas.core.window.rolling + .. autosummary:: :toctree: api/ @@ -38,6 +39,8 @@ Standard moving window functions Standard expanding window functions ----------------------------------- +.. currentmodule:: pandas.core.window.expanding + .. autosummary:: :toctree: api/ @@ -59,6 +62,8 @@ Standard expanding window functions Exponentially-weighted moving window functions ---------------------------------------------- +.. currentmodule:: pandas.core.window.ewm + .. autosummary:: :toctree: api/ diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst index 22a9791ffde30..62a9b6396404a 100644 --- a/doc/source/user_guide/advanced.rst +++ b/doc/source/user_guide/advanced.rst @@ -738,7 +738,7 @@ and allows efficient indexing and storage of an index with a large number of dup df['B'] = df['B'].astype(CategoricalDtype(list('cab'))) df df.dtypes - df.B.cat.categories + df['B'].cat.categories Setting the index will create a ``CategoricalIndex``. diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst index 15af5208a4f1f..c9d3bc3a28c70 100644 --- a/doc/source/user_guide/cookbook.rst +++ b/doc/source/user_guide/cookbook.rst @@ -592,8 +592,8 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. 
ipython:: python df = pd.DataFrame([0, 1, 0, 1, 1, 1, 0, 1, 1], columns=['A']) - df.A.groupby((df.A != df.A.shift()).cumsum()).groups - df.A.groupby((df.A != df.A.shift()).cumsum()).cumsum() + df['A'].groupby((df['A'] != df['A'].shift()).cumsum()).groups + df['A'].groupby((df['A'] != df['A'].shift()).cumsum()).cumsum() Expanding data ************** @@ -719,7 +719,7 @@ Rolling Apply to multiple columns where function calculates a Series before a Sc df def gm(df, const): - v = ((((df.A + df.B) + 1).cumprod()) - 1) * const + v = ((((df['A'] + df['B']) + 1).cumprod()) - 1) * const return v.iloc[-1] s = pd.Series({df.index[i]: gm(df.iloc[i:min(i + 51, len(df) - 1)], 5) diff --git a/doc/source/user_guide/enhancingperf.rst b/doc/source/user_guide/enhancingperf.rst index b77bfb9778837..2df5b9d82dcc3 100644 --- a/doc/source/user_guide/enhancingperf.rst +++ b/doc/source/user_guide/enhancingperf.rst @@ -243,9 +243,9 @@ We've gotten another big improvement. Let's check again where the time is spent: .. ipython:: python - %prun -l 4 apply_integrate_f(df['a'].to_numpy(), - df['b'].to_numpy(), - df['N'].to_numpy()) + %%prun -l 4 apply_integrate_f(df['a'].to_numpy(), + df['b'].to_numpy(), + df['N'].to_numpy()) As one might expect, the majority of the time is now spent in ``apply_integrate_f``, so if we wanted to make anymore efficiencies we must continue to concentrate our @@ -393,15 +393,15 @@ Consider the following toy example of doubling each observation: .. code-block:: ipython # Custom function without numba - In [5]: %timeit df['col1_doubled'] = df.a.apply(double_every_value_nonumba) # noqa E501 + In [5]: %timeit df['col1_doubled'] = df['a'].apply(double_every_value_nonumba) # noqa E501 1000 loops, best of 3: 797 us per loop # Standard implementation (faster than a custom function) - In [6]: %timeit df['col1_doubled'] = df.a * 2 + In [6]: %timeit df['col1_doubled'] = df['a'] * 2 1000 loops, best of 3: 233 us per loop # Custom function with numba - In [7]: %timeit (df['col1_doubled'] = double_every_value_withnumba(df.a.to_numpy()) + In [7]: %timeit (df['col1_doubled'] = double_every_value_withnumba(df['a'].to_numpy()) 1000 loops, best of 3: 145 us per loop Caveats @@ -643,8 +643,8 @@ The equivalent in standard Python would be .. ipython:: python df = pd.DataFrame(dict(a=range(5), b=range(5, 10))) - df['c'] = df.a + df.b - df['d'] = df.a + df.b + df.c + df['c'] = df['a'] + df['b'] + df['d'] = df['a'] + df['b'] + df['c'] df['a'] = 1 df @@ -688,7 +688,7 @@ name in an expression. a = np.random.randn() df.query('@a < a') - df.loc[a < df.a] # same as the previous expression + df.loc[a < df['a']] # same as the previous expression With :func:`pandas.eval` you cannot use the ``@`` prefix *at all*, because it isn't defined in that context. ``pandas`` will let you know this if you try to diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index e3b75afcf945e..cf55ce0c9a6d4 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -210,7 +210,7 @@ as an attribute: See `here for an explanation of valid identifiers `__. - - The attribute will not be available if it conflicts with an existing method name, e.g. ``s.min`` is not allowed. + - The attribute will not be available if it conflicts with an existing method name, e.g. ``s.min`` is not allowed, but ``s['min']`` is possible. - Similarly, the attribute will not be available if it conflicts with any of the following list: ``index``, ``major_axis``, ``minor_axis``, ``items``. 
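As a minimal sketch of why this matters (the Series below is hypothetical, chosen only to illustrate the name clash):

.. code-block:: python

   import pandas as pd

   s = pd.Series([1, 2, 3], index=['min', 'index', 'a'])
   s.min        # the Series.min method, not the value labelled 'min'
   s['min']     # 1 -- bracket indexing always refers to the label
   s.a          # 3 -- attribute access works when nothing conflicts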
@@ -540,7 +540,7 @@ The ``callable`` must be a function with one argument (the calling Series or Dat columns=list('ABCD')) df1 - df1.loc[lambda df: df.A > 0, :] + df1.loc[lambda df: df['A'] > 0, :] df1.loc[:, lambda df: ['A', 'B']] df1.iloc[:, lambda df: [0, 1]] @@ -552,7 +552,7 @@ You can use callable indexing in ``Series``. .. ipython:: python - df1.A.loc[lambda s: s > 0] + df1['A'].loc[lambda s: s > 0] Using these methods / indexers, you can chain data selection operations without using a temporary variable. @@ -561,7 +561,7 @@ without using a temporary variable. bb = pd.read_csv('data/baseball.csv', index_col='id') (bb.groupby(['year', 'team']).sum() - .loc[lambda df: df.r > 100]) + .loc[lambda df: df['r'] > 100]) .. _indexing.deprecate_ix: @@ -871,9 +871,9 @@ Boolean indexing Another common operation is the use of boolean vectors to filter the data. The operators are: ``|`` for ``or``, ``&`` for ``and``, and ``~`` for ``not``. These **must** be grouped by using parentheses, since by default Python will -evaluate an expression such as ``df.A > 2 & df.B < 3`` as -``df.A > (2 & df.B) < 3``, while the desired evaluation order is -``(df.A > 2) & (df.B < 3)``. +evaluate an expression such as ``df['A'] > 2 & df['B'] < 3`` as +``df['A'] > (2 & df['B']) < 3``, while the desired evaluation order is +``(df['A'] > 2) & (df['B'] < 3)``. Using a boolean vector to index a Series works exactly as in a NumPy ndarray: @@ -1134,7 +1134,7 @@ between the values of columns ``a`` and ``c``. For example: df # pure python - df[(df.a < df.b) & (df.b < df.c)] + df[(df['a'] < df['b']) & (df['b'] < df['c'])] # query df.query('(a < b) & (b < c)') @@ -1241,7 +1241,7 @@ Full numpy-like syntax: df = pd.DataFrame(np.random.randint(n, size=(n, 3)), columns=list('abc')) df df.query('(a < b) & (b < c)') - df[(df.a < df.b) & (df.b < df.c)] + df[(df['a'] < df['b']) & (df['b'] < df['c'])] Slightly nicer by removing the parentheses (comparison operators bind tighter than ``&`` and ``|``). @@ -1279,12 +1279,12 @@ The ``in`` and ``not in`` operators df.query('a in b') # How you'd do it in pure Python - df[df.a.isin(df.b)] + df[df['a'].isin(df['b'])] df.query('a not in b') # pure Python - df[~df.a.isin(df.b)] + df[~df['a'].isin(df['b'])] You can combine this with other expressions for very succinct queries: @@ -1297,7 +1297,7 @@ You can combine this with other expressions for very succinct queries: df.query('a in b and c < d') # pure Python - df[df.b.isin(df.a) & (df.c < df.d)] + df[df['b'].isin(df['a']) & (df['c'] < df['d'])] .. note:: @@ -1326,7 +1326,7 @@ to ``in``/``not in``. df.query('b == ["a", "b", "c"]') # pure Python - df[df.b.isin(["a", "b", "c"])] + df[df['b'].isin(["a", "b", "c"])] df.query('c == [1, 2]') @@ -1338,7 +1338,7 @@ to ``in``/``not in``. df.query('[1, 2] not in c') # pure Python - df[df.c.isin([1, 2])] + df[df['c'].isin([1, 2])] Boolean operators @@ -1352,7 +1352,7 @@ You can negate boolean expressions with the word ``not`` or the ``~`` operator.
df['bools'] = np.random.rand(len(df)) > 0.5 df.query('~bools') df.query('not bools') - df.query('not bools') == df[~df.bools] + df.query('not bools') == df[~df['bools']] Of course, expressions can be arbitrarily complex too: @@ -1362,7 +1362,10 @@ Of course, expressions can be arbitrarily complex too: shorter = df.query('a < b < c and (not bools) or bools > 2') # equivalent in pure Python - longer = df[(df.a < df.b) & (df.b < df.c) & (~df.bools) | (df.bools > 2)] + longer = df[(df['a'] < df['b']) + & (df['b'] < df['c']) + & (~df['bools']) + | (df['bools'] > 2)] shorter longer @@ -1835,14 +1838,14 @@ chained indexing expression, you can set the :ref:`option ` # This will show the SettingWithCopyWarning # but the frame values will be set - dfb['c'][dfb.a.str.startswith('o')] = 42 + dfb['c'][dfb['a'].str.startswith('o')] = 42 This however is operating on a copy and will not work. :: >>> pd.set_option('mode.chained_assignment','warn') - >>> dfb[dfb.a.str.startswith('o')]['c'] = 42 + >>> dfb[dfb['a'].str.startswith('o')]['c'] = 42 Traceback (most recent call last) ... SettingWithCopyWarning: diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 8e5352c337072..338c890ce317c 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -28,6 +28,7 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like :delim: ; text;`CSV `__;:ref:`read_csv`;:ref:`to_csv` + text;Fixed-Width Text File;:ref:`read_fwf` text;`JSON `__;:ref:`read_json`;:ref:`to_json` text;`HTML `__;:ref:`read_html`;:ref:`to_html` text; Local clipboard;:ref:`read_clipboard`;:ref:`to_clipboard` @@ -1372,6 +1373,7 @@ should pass the ``escapechar`` option: print(data) pd.read_csv(StringIO(data), escapechar='\\') +.. _io.fwf_reader: .. _io.fwf: Files with fixed width columns @@ -3204,7 +3206,7 @@ argument to ``to_excel`` and to ``ExcelWriter``. The built-in engines are: writer = pd.ExcelWriter('path_to_file.xlsx', engine='xlsxwriter') # Or via pandas configuration. - from pandas import options # noqa: E402 + from pandas import options # noqa: E402 options.io.excel.xlsx.writer = 'xlsxwriter' df.to_excel('path_to_file.xlsx', sheet_name='Sheet1') @@ -3572,7 +3574,7 @@ Closing a Store and using a context manager: Read/write API '''''''''''''' -``HDFStore`` supports an top-level API using ``read_hdf`` for reading and ``to_hdf`` for writing, +``HDFStore`` supports a top-level API using ``read_hdf`` for reading and ``to_hdf`` for writing, similar to how ``read_csv`` and ``to_csv`` work. .. ipython:: python @@ -3687,7 +3689,7 @@ Hierarchical keys Keys to a store can be specified as a string. These can be in a hierarchical path-name like format (e.g. ``foo/bar/bah``), which will generate a hierarchy of sub-stores (or ``Groups`` in PyTables -parlance). Keys can be specified with out the leading '/' and are **always** +parlance). Keys can be specified without the leading '/' and are **always** absolute (e.g. 'foo' refers to '/foo'). Removal operations can remove everything in the sub-store and **below**, so be *careful*. @@ -3825,7 +3827,7 @@ data. A query is specified using the ``Term`` class under the hood, as a boolean expression. -* ``index`` and ``columns`` are supported indexers of a ``DataFrames``. +* ``index`` and ``columns`` are supported indexers of ``DataFrames``. * if ``data_columns`` are specified, these can be used as additional indexers. Valid comparison operators are: @@ -3917,7 +3919,7 @@ Use boolean expressions, with in-line function evaluation. 
store.select('dfq', "index>pd.Timestamp('20130104') & columns=['A', 'B']") -Use and inline column reference +Use inline column reference. .. ipython:: python @@ -4593,8 +4595,8 @@ Performance write chunksize (default is 50000). This will significantly lower your memory usage on writing. * You can pass ``expectedrows=`` to the first ``append``, - to set the TOTAL number of expected rows that ``PyTables`` will - expected. This will optimize read/write performance. + to set the TOTAL number of rows that ``PyTables`` will expect. + This will optimize read/write performance. * Duplicate rows can be written to tables, but are filtered out in selection (with the last items being selected; thus a table is unique on major, minor pairs) @@ -5491,30 +5493,29 @@ The top-level function :func:`read_spss` can read (but not write) SPSS `sav` (.sav) and `zsav` (.zsav) format files. SPSS files contain column names. By default the -whole file is read, categorical columns are converted into ``pd.Categorical`` +whole file is read, categorical columns are converted into ``pd.Categorical``, and a ``DataFrame`` with all columns is returned. -Specify a ``usecols`` to obtain a subset of columns. Specify ``convert_categoricals=False`` +Specify the ``usecols`` parameter to obtain a subset of columns. Specify ``convert_categoricals=False`` to avoid converting categorical columns into ``pd.Categorical``. -Read a spss file: +Read an SPSS file: .. code-block:: python - df = pd.read_spss('spss_data.zsav') + df = pd.read_spss('spss_data.sav') -Extract a subset of columns ``usecols`` from SPSS file and +Extract a subset of columns contained in ``usecols`` from an SPSS file and avoid converting categorical columns into ``pd.Categorical``: .. code-block:: python - df = pd.read_spss('spss_data.zsav', usecols=['foo', 'bar'], + df = pd.read_spss('spss_data.sav', usecols=['foo', 'bar'], convert_categoricals=False) -More info_ about the sav and zsav file format is available from the IBM -web site. +More information about the `sav` and `zsav` file format is available here_. -.. _info: https://www.ibm.com/support/knowledgecenter/en/SSLVMB_22.0.0/com.ibm.spss.statistics.help/spss/base/savedatatypes.htm +.. _here: https://www.ibm.com/support/knowledgecenter/en/SSLVMB_22.0.0/com.ibm.spss.statistics.help/spss/base/savedatatypes.htm .. _io.other: diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index f118fe84d523a..dd6d3062a8f0a 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -469,7 +469,7 @@ If ``crosstab`` receives only two Series, it will provide a frequency table. 'C': [1, 1, np.nan, 1, 1]}) df - pd.crosstab(df.A, df.B) + pd.crosstab(df['A'], df['B']) Any input passed containing ``Categorical`` data will have **all** of its categories included in the cross-tabulation, even if the actual data does @@ -489,13 +489,13 @@ using the ``normalize`` argument: .. ipython:: python - pd.crosstab(df.A, df.B, normalize=True) + pd.crosstab(df['A'], df['B'], normalize=True) ``normalize`` can also normalize values within each row or within each column: .. ipython:: python - pd.crosstab(df.A, df.B, normalize='columns') + pd.crosstab(df['A'], df['B'], normalize='columns') ``crosstab`` can also be passed a third ``Series`` and an aggregation function (``aggfunc``) that will be applied to the values of the third ``Series`` within @@ -503,7 +503,7 @@ each group defined by the first two ``Series``: .. 
ipython:: python - pd.crosstab(df.A, df.B, values=df.C, aggfunc=np.sum) + pd.crosstab(df['A'], df['B'], values=df['C'], aggfunc=np.sum) Adding margins ~~~~~~~~~~~~~~ @@ -512,7 +512,7 @@ Finally, one can also add margins or normalize this output. .. ipython:: python - pd.crosstab(df.A, df.B, values=df.C, aggfunc=np.sum, normalize=True, + pd.crosstab(df['A'], df['B'], values=df['C'], aggfunc=np.sum, normalize=True, margins=True) .. _reshaping.tile: diff --git a/doc/source/user_guide/visualization.rst b/doc/source/user_guide/visualization.rst index fdceaa5868cec..fa16b2f216610 100644 --- a/doc/source/user_guide/visualization.rst +++ b/doc/source/user_guide/visualization.rst @@ -1148,10 +1148,10 @@ To plot data on a secondary y-axis, use the ``secondary_y`` keyword: .. ipython:: python - df.A.plot() + df['A'].plot() @savefig series_plot_secondary_y.png - df.B.plot(secondary_y=True, style='g') + df['B'].plot(secondary_y=True, style='g') .. ipython:: python :suppress: @@ -1205,7 +1205,7 @@ Here is the default behavior, notice how the x-axis tick labeling is performed: plt.figure() @savefig ser_plot_suppress.png - df.A.plot() + df['A'].plot() .. ipython:: python :suppress: @@ -1219,7 +1219,7 @@ Using the ``x_compat`` parameter, you can suppress this behavior: plt.figure() @savefig ser_plot_suppress_parm.png - df.A.plot(x_compat=True) + df['A'].plot(x_compat=True) .. ipython:: python :suppress: @@ -1235,9 +1235,9 @@ in ``pandas.plotting.plot_params`` can be used in a `with statement`: @savefig ser_plot_suppress_context.png with pd.plotting.plot_params.use('x_compat', True): - df.A.plot(color='r') - df.B.plot(color='g') - df.C.plot(color='b') + df['A'].plot(color='r') + df['B'].plot(color='g') + df['C'].plot(color='b') .. ipython:: python :suppress: diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index aeab2cf5809e7..fe80cc8bb959a 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -24,6 +24,7 @@ Version 0.25 .. toctree:: :maxdepth: 2 + v0.25.2 v0.25.1 v0.25.0 diff --git a/doc/source/whatsnew/v0.10.0.rst b/doc/source/whatsnew/v0.10.0.rst index 59ea6b9776232..2e0442364b2f3 100644 --- a/doc/source/whatsnew/v0.10.0.rst +++ b/doc/source/whatsnew/v0.10.0.rst @@ -498,7 +498,7 @@ Here is a taste of what to expect. .. code-block:: ipython - In [58]: p4d = Panel4D(randn(2, 2, 5, 4), + In [58]: p4d = Panel4D(np.random.randn(2, 2, 5, 4), ....: labels=['Label1','Label2'], ....: items=['Item1', 'Item2'], ....: major_axis=date_range('1/1/2000', periods=5), diff --git a/doc/source/whatsnew/v0.20.0.rst b/doc/source/whatsnew/v0.20.0.rst index ef6108ae3ec90..62604dd3edd2d 100644 --- a/doc/source/whatsnew/v0.20.0.rst +++ b/doc/source/whatsnew/v0.20.0.rst @@ -495,7 +495,7 @@ Other enhancements - :func:`pandas.util.hash_pandas_object` has gained the ability to hash a ``MultiIndex`` (:issue:`15224`) - ``Series/DataFrame.squeeze()`` have gained the ``axis`` parameter. (:issue:`15339`) - ``DataFrame.to_excel()`` has a new ``freeze_panes`` parameter to turn on Freeze Panes when exporting to Excel (:issue:`15160`) -- ``pd.read_html()`` will parse multiple header rows, creating a MutliIndex header. (:issue:`13434`). +- ``pd.read_html()`` will parse multiple header rows, creating a MultiIndex header. (:issue:`13434`). - HTML table output skips ``colspan`` or ``rowspan`` attribute if equal to 1. 
(:issue:`15403`) - :class:`pandas.io.formats.style.Styler` template now has blocks for easier extension, see the :ref:`example notebook ` (:issue:`15649`) - :meth:`Styler.render() ` now accepts ``**kwargs`` to allow user-defined variables in the template (:issue:`15649`) diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst index 4d9ee4c676759..63dd56f4a3793 100644 --- a/doc/source/whatsnew/v0.25.1.rst +++ b/doc/source/whatsnew/v0.25.1.rst @@ -1,161 +1,115 @@ .. _whatsnew_0251: -What's new in 0.25.1 (July XX, 2019) ------------------------------------- +What's new in 0.25.1 (August 21, 2019) +-------------------------------------- -Enhancements -~~~~~~~~~~~~ - - -.. _whatsnew_0251.enhancements.other: +These are the changes in pandas 0.25.1. See :ref:`release` for a full changelog +including other versions of pandas. -Other enhancements -^^^^^^^^^^^^^^^^^^ +I/O and LZMA +~~~~~~~~~~~~ -- -- -- +Some users may unknowingly have an incomplete Python installation lacking the `lzma` module from the standard library. In this case, `import pandas` failed due to an `ImportError` (:issue:`27575`). +Pandas will now warn, rather than raising an `ImportError`, if the `lzma` module is not present. Any subsequent attempt to use `lzma` methods will raise a `RuntimeError`. +A possible fix for the lack of the `lzma` module is to ensure you have the necessary libraries and then re-install Python. +For example, on macOS, installing Python with `pyenv` may lead to an incomplete Python installation due to unmet system dependencies at compilation time (like `xz`). Compilation will succeed, but Python might fail at run time. The issue can be solved by installing the necessary dependencies and then re-installing Python. .. _whatsnew_0251.bug_fixes: Bug fixes ~~~~~~~~~ - Categorical ^^^^^^^^^^^ -- -- -- +- Bug in :meth:`Categorical.fillna` that would replace all values, not just those that are ``NaN`` (:issue:`26215`) Datetimelike ^^^^^^^^^^^^ -- -- -- - -Timedelta -^^^^^^^^^ - -- -- -- +- Bug in :func:`to_datetime` where passing a timezone-naive :class:`DatetimeArray` or :class:`DatetimeIndex` and ``utc=True`` would incorrectly return a timezone-naive result (:issue:`27733`) +- Bug in :meth:`Period.to_timestamp` where a :class:`Period` outside the :class:`Timestamp` implementation bounds (roughly 1677-09-21 to 2262-04-11) would return an incorrect :class:`Timestamp` instead of raising ``OutOfBoundsDatetime`` (:issue:`19643`) +- Bug in iterating over :class:`DatetimeIndex` when the underlying data is read-only (:issue:`28055`) Timezones ^^^^^^^^^ - Bug in :class:`Index` where a numpy object array with a timezone aware :class:`Timestamp` and ``np.nan`` would not return a :class:`DatetimeIndex` (:issue:`27011`) -- -- Numeric ^^^^^^^ + - Bug in :meth:`Series.interpolate` when using a timezone aware :class:`DatetimeIndex` (:issue:`27548`) - Bug when printing negative floating point complex numbers would raise an ``IndexError`` (:issue:`27484`) -- -- +- Bug where :class:`DataFrame` arithmetic operators such as :meth:`DataFrame.mul` with a :class:`Series` with axis=1 would raise an ``AttributeError`` on :class:`DataFrame` larger than the minimum threshold to invoke numexpr (:issue:`27636`) +- Bug in :class:`DataFrame` arithmetic where missing values in results were incorrectly masked with ``NaN`` instead of ``Inf`` (:issue:`27464`) Conversion ^^^^^^^^^^ - Improved the warnings for the deprecated methods :meth:`Series.real` and :meth:`Series.imag` (:issue:`27610`) -- -- - -Strings -^^^^^^^ - -- --
-- - Interval ^^^^^^^^ + - Bug in :class:`IntervalIndex` where `dir(obj)` would raise ``ValueError`` (:issue:`27571`) -- -- -- Indexing ^^^^^^^^ - Bug in partial-string indexing returning a NumPy array rather than a ``Series`` when indexing with a scalar like ``.loc['2015']`` (:issue:`27516`) -- -- +- Break reference cycle involving :class:`Index` and other index classes to allow garbage collection of index objects without running the GC. (:issue:`27585`, :issue:`27840`) +- Fix regression in assigning values to a single column of a DataFrame with ``MultiIndex`` columns (:issue:`27841`). +- Fix regression in ``.ix`` fallback with an ``IntervalIndex`` (:issue:`27865`). Missing ^^^^^^^ -- -- -- - -MultiIndex -^^^^^^^^^^ - -- -- -- +- Bug in :func:`pandas.isnull` or :func:`pandas.isna` when the input is a type, e.g. ``type(pandas.Series())`` (:issue:`27482`) I/O ^^^ -- -- -- +- Avoid calling ``S3File.s3`` when reading parquet, as this was removed in s3fs version 0.3.0 (:issue:`27756`) +- Better error message when a negative header is passed in :func:`pandas.read_csv` (:issue:`27779`) +- Follow the ``min_rows`` display option (introduced in v0.25.0) correctly in the HTML repr in the notebook (:issue:`27991`). Plotting ^^^^^^^^ -- Added a pandas_plotting_backends entrypoint group for registering plot backends. See :ref:`extending.plotting-backends` for more (:issue:`26747`). -- -- +- Added a ``pandas_plotting_backends`` entrypoint group for registering plot backends. See :ref:`extending.plotting-backends` for more (:issue:`26747`). +- Fixed the reinstatement of Matplotlib datetime converters after calling + :meth:`pandas.plotting.deregister_matplotlib_converters` (:issue:`27481`). +- Fix compatibility issue with matplotlib when passing a pandas ``Index`` to a plot call (:issue:`27775`). Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ +- Fixed regression in :meth:`pandas.core.groupby.DataFrameGroupBy.quantile` raising when multiple quantiles are given (:issue:`27526`) - Bug in :meth:`pandas.core.groupby.DataFrameGroupBy.transform` where applying a timezone conversion lambda function would drop timezone information (:issue:`27496`) -- -- +- Bug in :meth:`pandas.core.groupby.GroupBy.nth` where ``observed=False`` was being ignored for Categorical groupers (:issue:`26385`) +- Bug in windowing over read-only arrays (:issue:`27766`) +- Fixed segfault in :meth:`pandas.core.groupby.DataFrameGroupBy.quantile` when an invalid quantile was passed (:issue:`27470`) Reshaping ^^^^^^^^^ - A ``KeyError`` is now raised if ``.unstack()`` is called on a :class:`Series` or :class:`DataFrame` with a flat :class:`Index` passing a name which is not the correct one (:issue:`18303`) +- Bug where :meth:`merge_asof` could not merge :class:`Timedelta` objects when passing the `tolerance` kwarg (:issue:`27642`) +- Bug in :meth:`DataFrame.crosstab` raising an error when ``margins`` is set to ``True`` and ``normalize`` is not ``False``.
(:issue:`27500`) - :meth:`DataFrame.join` now suppresses the ``FutureWarning`` when the sort parameter is specified (:issue:`21952`) -- +- Bug in :meth:`DataFrame.join` raising with readonly arrays (:issue:`27943`) Sparse ^^^^^^ -- -- -- - - -Build Changes -^^^^^^^^^^^^^ - -- -- -- - -ExtensionArray -^^^^^^^^^^^^^^ - -- -- -- +- Bug in reductions for :class:`Series` with Sparse dtypes (:issue:`27080`) Other ^^^^^ + - Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` when replacing timezone-aware timestamps using a dict-like replacer (:issue:`27720`) -- -- -- +- Bug in :meth:`Series.rename` when using a custom type indexer. Now any value that isn't callable or dict-like is treated as a scalar. (:issue:`27814`) .. _whatsnew_0.251.contributors: diff --git a/doc/source/whatsnew/v0.25.2.rst b/doc/source/whatsnew/v0.25.2.rst new file mode 100644 index 0000000000000..1cdf213d81a74 --- /dev/null +++ b/doc/source/whatsnew/v0.25.2.rst @@ -0,0 +1,110 @@ +.. _whatsnew_0252: + +What's new in 0.25.2 (October XX, 2019) +--------------------------------------- + +These are the changes in pandas 0.25.2. See :ref:`release` for a full changelog +including other versions of pandas. + +.. _whatsnew_0252.bug_fixes: + +Bug fixes +~~~~~~~~~ + +Categorical +^^^^^^^^^^^ + +- + +Datetimelike +^^^^^^^^^^^^ + +- +- +- + +Timezones +^^^^^^^^^ + +- + +Numeric +^^^^^^^ + +- +- +- +- + +Conversion +^^^^^^^^^^ + +- + +Interval +^^^^^^^^ + +- + +Indexing +^^^^^^^^ + +- +- +- +- + +Missing +^^^^^^^ + +- + +I/O +^^^ + +- Fix regression in notebook display where tags not used for :attr:`DataFrame.index` (:issue:`28204`). +- Regression in :meth:`~DataFrame.to_csv` where writing a :class:`Series` or :class:`DataFrame` indexed by an :class:`IntervalIndex` would incorrectly raise a ``TypeError`` (:issue:`28210`) +- +- + +Plotting +^^^^^^^^ + +- +- +- + +Groupby/resample/rolling +^^^^^^^^^^^^^^^^^^^^^^^^ + +- Bug incorrectly raising an ``IndexError`` when passing a list of quantiles to :meth:`pandas.core.groupby.DataFrameGroupBy.quantile` (:issue:`28113`). +- +- +- + +Reshaping +^^^^^^^^^ + +- +- +- +- +- + +Sparse +^^^^^^ + +- + +Other +^^^^^ + +- Compatibility with Python 3.8 in :meth:`DataFrame.query` (:issue:`27261`) +- + +.. _whatsnew_0.252.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v0.25.1..HEAD diff --git a/doc/source/whatsnew/v0.7.3.rst b/doc/source/whatsnew/v0.7.3.rst index a8697f60d7467..020cf3bdc2d59 100644 --- a/doc/source/whatsnew/v0.7.3.rst +++ b/doc/source/whatsnew/v0.7.3.rst @@ -25,8 +25,6 @@ New features from pandas.tools.plotting import scatter_matrix scatter_matrix(df, alpha=0.2) # noqa F821 -.. image:: ../savefig/scatter_matrix_kde.png - :width: 5in - Add ``stacked`` argument to Series and DataFrame's ``plot`` method for :ref:`stacked bar plots `. @@ -35,15 +33,11 @@ New features df.plot(kind='bar', stacked=True) # noqa F821 -.. image:: ../savefig/bar_plot_stacked_ex.png - :width: 4in .. code-block:: python df.plot(kind='barh', stacked=True) # noqa F821 -.. image:: ../savefig/barh_plot_stacked_ex.png - :width: 4in - Add log x and y :ref:`scaling options ` to ``DataFrame.plot`` and ``Series.plot`` diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 04cd5e4c2c918..3b6288146bdf2 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -21,27 +21,27 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ -.. _whatsnew_1000.enhancements.other: - - - +.. 
_whatsnew_1000.enhancements.other: + Other enhancements ^^^^^^^^^^^^^^^^^^ -.. _whatsnew_1000.api_breaking: - - - +.. _whatsnew_1000.api_breaking: + Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. _whatsnew_1000.api.other: - - :class:`pandas.core.groupby.GroupBy.transform` now raises on invalid operation names (:issue:`27489`). - +.. _whatsnew_1000.api.other: + Other API changes ^^^^^^^^^^^^^^^^^ @@ -65,7 +65,8 @@ Removal of prior version deprecations/changes - Changed the default value of `inplace` in :meth:`DataFrame.set_index` and :meth:`Series.set_axis`. It now defaults to False (:issue:`27600`) - :meth:`pandas.Series.str.cat` now defaults to aligning ``others``, using ``join='left'`` (:issue:`27611`) - :meth:`pandas.Series.str.cat` does not accept list-likes *within* list-likes anymore (:issue:`27611`) -- +- Removed the previously deprecated :meth:`ExtensionArray._formatting_values`. Use :attr:`ExtensionArray._formatter` instead. (:issue:`23601`) +- Removed the previously deprecated ``IntervalIndex.from_intervals`` in favor of the :class:`IntervalIndex` constructor (:issue:`19263`) .. _whatsnew_1000.performance: @@ -75,6 +76,7 @@ Performance improvements - Performance improvement in indexing with a non-unique :class:`IntervalIndex` (:issue:`27489`) - Performance improvement in `MultiIndex.is_monotonic` (:issue:`27495`) - Performance improvement in :func:`cut` when ``bins`` is an :class:`IntervalIndex` (:issue:`27668`) +- Performance improvement in :meth:`DataFrame.replace` when provided a list of values to replace (:issue:`28099`) .. _whatsnew_1000.bug_fixes: @@ -86,6 +88,7 @@ Bug fixes Categorical ^^^^^^^^^^^ +- Added test to assert that :func:`fillna` raises the correct ValueError message when the value isn't a value from categories (:issue:`13628`) - - @@ -94,6 +97,7 @@ Datetimelike ^^^^^^^^^^^^ - Bug in :meth:`Series.__setitem__` incorrectly casting ``np.timedelta64("NaT")`` to ``np.datetime64("NaT")`` when inserting into a :class:`Series` with datetime64 dtype (:issue:`27311`) - Bug in :meth:`Series.dt` property lookups when the underlying data is read-only (:issue:`27529`) +- Bug in ``HDFStore.__getitem__`` incorrectly reading tz attribute created in Python 2 (:issue:`26443`) - @@ -138,7 +142,7 @@ Interval Indexing ^^^^^^^^ -- +- Bug in assignment using a reverse slicer (:issue:`26939`) - Missing @@ -156,14 +160,18 @@ MultiIndex I/O ^^^ -- +- :meth:`read_csv` now accepts binary mode file buffers when using the Python csv engine (:issue:`23779`) +- Bug in :meth:`DataFrame.to_json` where using a Tuple as a column or index value and using ``orient="columns"`` or ``orient="index"`` would produce invalid JSON (:issue:`20500`) - Plotting ^^^^^^^^ +- Bug in :meth:`Series.plot` not being able to plot boolean values (:issue:`23719`) - -- +- Bug in :meth:`DataFrame.plot` producing incorrect legend markers when plotting multiple series on the same axis (:issue:`18222`) +- Bug in :meth:`DataFrame.plot` when ``kind='box'`` and data contains datetime or timedelta data.
These types are now automatically dropped (:issue:`22799`) +- Bug in :meth:`DataFrame.plot.line` and :meth:`DataFrame.plot.area` producing a wrong xlim on the x-axis (:issue:`27686`, :issue:`25160`, :issue:`24784`) Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ @@ -171,6 +179,7 @@ Groupby/resample/rolling - - - Bug in :meth:`DataFrame.groupby` not offering selection by column name when ``axis=1`` (:issue:`27614`) +- Bug in :meth:`DataFrameGroupBy.agg` not being able to use a lambda function with named aggregation (:issue:`27519`) Reshaping ^^^^^^^^^ @@ -187,6 +196,7 @@ Sparse Build Changes ^^^^^^^^^^^^^ +- Fixed pyqt development dependency issue because of different pyqt package name in conda and PyPI (:issue:`26838`) ExtensionArray @@ -195,6 +205,14 @@ ExtensionArray - - + +Other +^^^^^ +- Trying to set the ``display.precision``, ``display.max_rows`` or ``display.max_columns`` using :meth:`set_option` to anything but a ``None`` or a positive int will raise a ``ValueError`` (:issue:`23348`) +- Using :meth:`DataFrame.replace` with overlapping keys in a nested dictionary will no longer raise, now matching the behavior of a flat dictionary (:issue:`27660`) +- :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now support dicts as ``compression`` argument with key ``'method'`` being the compression method and others as additional compression options when the compression method is ``'zip'``. (:issue:`26023`) + + .. _whatsnew_1000.contributors: Contributors diff --git a/doc/sphinxext/contributors.py b/doc/sphinxext/contributors.py index 4256e4659715d..1a064f71792e9 100644 --- a/doc/sphinxext/contributors.py +++ b/doc/sphinxext/contributors.py @@ -8,12 +8,11 @@ code contributors and commits, and then list each contributor individually. """ +from announce import build_components from docutils import nodes from docutils.parsers.rst import Directive import git -from announce import build_components - class ContributorsDirective(Directive): required_arguments = 1 diff --git a/environment.yml b/environment.yml index 93e8302b498a0..6d2cd701c3854 100644 --- a/environment.yml +++ b/environment.yml @@ -71,7 +71,7 @@ dependencies: - lxml # pandas.read_html - openpyxl # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile - pyarrow>=0.9.0 # pandas.read_parquet, DataFrame.to_parquet, pandas.read_feather, DataFrame.to_feather - - pyqt # pandas.read_clipbobard + - pyqt>=5.9.2 # pandas.read_clipboard - pytables>=3.4.2 # pandas.read_hdf, DataFrame.to_hdf - python-snappy # required by pyarrow - s3fs # pandas.read_csv... when using 's3://...' path diff --git a/pandas/_config/config.py b/pandas/_config/config.py index 4f0720abd1445..890db5b41907e 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -787,6 +787,7 @@ def is_instance_factory(_type): ValueError if x is not an instance of `_type` """ + if isinstance(_type, (tuple, list)): _type = tuple(_type) type_repr = "|".join(map(str, _type)) @@ -820,6 +821,32 @@ def inner(x): return inner +def is_nonnegative_int(value): + """ + Verify that value is None or a nonnegative int. + + Parameters + ---------- + value : None or int + The `value` to be checked. + + Raises + ------ + ValueError + When the value is not None or is a negative integer + """ + + if value is None: + return + + elif isinstance(value, int): + if value >= 0: + return + + msg = "Value must be a nonnegative integer or None" + raise ValueError(msg) + + # common type validators, for convenience # usage: register_option(...
, validator = is_int) is_int = is_type_factory(int) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index e3f18572abca1..3069bbbf34bb7 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -719,6 +719,11 @@ def group_quantile(ndarray[float64_t] out, ndarray[int64_t] counts, non_na_counts, sort_arr assert values.shape[0] == N + + if not (0 <= q <= 1): + raise ValueError("'q' must be between 0 and 1. Got" + " '{}' instead".format(q)) + inter_methods = { 'linear': INTERPOLATION_LINEAR, 'lower': INTERPOLATION_LOWER, diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 3e620f5934d5e..b8df78e600a46 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -108,7 +108,7 @@ cdef class Int64Factorizer: def get_count(self): return self.count - def factorize(self, int64_t[:] values, sort=False, + def factorize(self, const int64_t[:] values, sort=False, na_sentinel=-1, na_value=None): """ Factorize values with nans replaced by na_sentinel diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index f704ceffa662e..7424c4ddc3d92 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -47,10 +47,6 @@ cpdef get_value_at(ndarray arr, object loc, object tz=None): return util.get_value_at(arr, loc) -def get_value_box(arr: ndarray, loc: object) -> object: - return get_value_at(arr, loc, tz=None) - - # Don't populate hash tables in monotonic indexes larger than this _SIZE_CUTOFF = 1000000 diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index cafc31dad3568..6cc9dd22ce7c9 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -2,7 +2,6 @@ # See LICENSE for the license import bz2 import gzip -import lzma import os import sys import time @@ -59,9 +58,12 @@ from pandas.core.arrays import Categorical from pandas.core.dtypes.concat import union_categoricals import pandas.io.common as icom +from pandas.compat import _import_lzma, _get_lzma_file from pandas.errors import (ParserError, DtypeWarning, EmptyDataError, ParserWarning) +lzma = _import_lzma() + # Import CParserError as alias of ParserError for backwards compatibility. # Ultimately, we want to remove this import. See gh-12665 and gh-14479. 
CParserError = ParserError @@ -645,9 +647,9 @@ cdef class TextReader: 'zip file %s', str(zip_names)) elif self.compression == 'xz': if isinstance(source, str): - source = lzma.LZMAFile(source, 'rb') + source = _get_lzma_file(lzma)(source, 'rb') else: - source = lzma.LZMAFile(filename=source) + source = _get_lzma_file(lzma)(filename=source) else: raise ValueError('Unrecognized compression type: %s' % self.compression) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index f95685c337969..c892c1cf1b8a3 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -296,8 +296,6 @@ cdef class SeriesBinGrouper: islider.advance(group_size) vslider.advance(group_size) - except: - raise finally: # so we don't free the wrong memory islider.reset() @@ -425,8 +423,6 @@ cdef class SeriesGrouper: group_size = 0 - except: - raise finally: # so we don't free the wrong memory islider.reset() diff --git a/pandas/_libs/src/ujson/lib/ultrajson.h b/pandas/_libs/src/ujson/lib/ultrajson.h index 0470fef450dde..ee6e7081bf00e 100644 --- a/pandas/_libs/src/ujson/lib/ultrajson.h +++ b/pandas/_libs/src/ujson/lib/ultrajson.h @@ -307,11 +307,4 @@ EXPORTFUNCTION JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec, const char *buffer, size_t cbBuffer); EXPORTFUNCTION void encode(JSOBJ, JSONObjectEncoder *, const char *, size_t); -#define Buffer_Reserve(__enc, __len) \ - if ((size_t)((__enc)->end - (__enc)->offset) < (size_t)(__len)) { \ - Buffer_Realloc((__enc), (__len)); \ - } - -void Buffer_Realloc(JSONObjectEncoder *enc, size_t cbNeeded); - #endif // PANDAS__LIBS_SRC_UJSON_LIB_ULTRAJSON_H_ diff --git a/pandas/_libs/src/ujson/lib/ultrajsonenc.c b/pandas/_libs/src/ujson/lib/ultrajsonenc.c index 2d6c823a45515..d5b379bee585b 100644 --- a/pandas/_libs/src/ujson/lib/ultrajsonenc.c +++ b/pandas/_libs/src/ujson/lib/ultrajsonenc.c @@ -714,6 +714,12 @@ int Buffer_EscapeStringValidated(JSOBJ obj, JSONObjectEncoder *enc, } } +#define Buffer_Reserve(__enc, __len) \ + if ( (size_t) ((__enc)->end - (__enc)->offset) < (size_t) (__len)) \ + { \ + Buffer_Realloc((__enc), (__len));\ + } \ + #define Buffer_AppendCharUnchecked(__enc, __chr) *((__enc)->offset++) = __chr; FASTCALL_ATTR INLINE_PREFIX void FASTCALL_MSVC strreverse(char *begin, diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 926440218b5d9..4b612bb033761 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -16,18 +16,19 @@ derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +DISCLAIMED. 
IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) https://github.com/client9/stringencoders -Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. +Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights +reserved. Numeric decoder derived from from TCL library http://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms @@ -48,13 +49,13 @@ Numeric decoder derived from from TCL library #include <../../../tslibs/src/datetime/np_datetime_strings.h> #include "datetime.h" -#define NPY_JSON_BUFSIZE 32768 - static PyTypeObject *type_decimal; static PyTypeObject *cls_dataframe; static PyTypeObject *cls_series; static PyTypeObject *cls_index; static PyTypeObject *cls_nat; +PyObject *cls_timestamp; +PyObject *cls_timedelta; npy_int64 get_nat(void) { return NPY_MIN_INT64; } @@ -64,9 +65,9 @@ typedef void *(*PFN_PyTypeToJSON)(JSOBJ obj, JSONTypeContext *ti, typedef struct __NpyArrContext { PyObject *array; char *dataptr; - int curdim; // current dimension in array's order - int stridedim; // dimension we are striding over - int inc; // stride dimension increment (+/- 1) + int curdim; // current dimension in array's order + int stridedim; // dimension we are striding over + int inc; // stride dimension increment (+/- 1) npy_intp dim; npy_intp stride; npy_intp ndim; @@ -83,8 +84,8 @@ typedef struct __PdBlockContext { int ncols; int transpose; - int *cindices; // frame column -> block column map - NpyArrContext **npyCtxts; // NpyArrContext for each column + int *cindices; // frame column -> block column map + NpyArrContext **npyCtxts; // NpyArrContext for each column } PdBlockContext; typedef struct __TypeContext { @@ -148,13 +149,12 @@ enum PANDAS_FORMAT { SPLIT, RECORDS, INDEX, COLUMNS, VALUES }; int PdBlock_iterNext(JSOBJ, JSONTypeContext *); -void *initObjToJSON(void) -{ +void *initObjToJSON(void) { PyObject *mod_pandas; PyObject *mod_nattype; PyObject *mod_decimal = PyImport_ImportModule("decimal"); type_decimal = - (PyTypeObject *)PyObject_GetAttrString(mod_decimal, "Decimal"); + (PyTypeObject *)PyObject_GetAttrString(mod_decimal, "Decimal"); Py_DECREF(mod_decimal); PyDateTime_IMPORT; @@ -166,13 +166,15 @@ void *initObjToJSON(void) cls_index = (PyTypeObject *)PyObject_GetAttrString(mod_pandas, "Index"); cls_series = (PyTypeObject *)PyObject_GetAttrString(mod_pandas, "Series"); + cls_timestamp = PyObject_GetAttrString(mod_pandas, "Timestamp"); + cls_timedelta = PyObject_GetAttrString(mod_pandas, "Timedelta"); Py_DECREF(mod_pandas); } mod_nattype = PyImport_ImportModule("pandas._libs.tslibs.nattype"); if (mod_nattype) { - cls_nat = (PyTypeObject *)PyObject_GetAttrString(mod_nattype, - "NaTType"); + cls_nat = + (PyTypeObject *)PyObject_GetAttrString(mod_nattype, "NaTType"); Py_DECREF(mod_nattype); } @@ -210,7 +212,6 @@ static TypeContext *createTypeContext(void) { return pc; } - static int is_sparse_array(PyObject *obj) { // TODO can be removed again once SparseArray.values is removed (GH26421) 
if (PyObject_HasAttrString(obj, "_subtyp")) { @@ -225,7 +226,6 @@ static int is_sparse_array(PyObject *obj) { return 0; } - static PyObject *get_values(PyObject *obj) { PyObject *values = NULL; @@ -240,7 +240,8 @@ static PyObject *get_values(PyObject *obj) { values = PyObject_CallMethod(values, "to_numpy", NULL); } - if (!is_sparse_array(values) && PyObject_HasAttrString(values, "values")) { + if (!is_sparse_array(values) && + PyObject_HasAttrString(values, "values")) { PyObject *subvals = get_values(values); PyErr_Clear(); PRINTMARK(); @@ -355,20 +356,20 @@ static Py_ssize_t get_attr_length(PyObject *obj, char *attr) { } static npy_int64 get_long_attr(PyObject *o, const char *attr) { - npy_int64 long_val; - PyObject *value = PyObject_GetAttrString(o, attr); - long_val = (PyLong_Check(value) ? - PyLong_AsLongLong(value) : PyLong_AsLong(value)); - Py_DECREF(value); - return long_val; + npy_int64 long_val; + PyObject *value = PyObject_GetAttrString(o, attr); + long_val = + (PyLong_Check(value) ? PyLong_AsLongLong(value) : PyLong_AsLong(value)); + Py_DECREF(value); + return long_val; } static npy_float64 total_seconds(PyObject *td) { - npy_float64 double_val; - PyObject *value = PyObject_CallMethod(td, "total_seconds", NULL); - double_val = PyFloat_AS_DOUBLE(value); - Py_DECREF(value); - return double_val; + npy_float64 double_val; + PyObject *value = PyObject_CallMethod(td, "total_seconds", NULL); + double_val = PyFloat_AS_DOUBLE(value); + Py_DECREF(value); + return double_val; } static PyObject *get_item(PyObject *obj, Py_ssize_t i) { @@ -448,7 +449,7 @@ static void *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, if (PyUnicode_IS_COMPACT_ASCII(obj)) { Py_ssize_t len; - char *data = (char*)PyUnicode_AsUTF8AndSize(obj, &len); + char *data = (char *)PyUnicode_AsUTF8AndSize(obj, &len); *_outLen = len; return data; } @@ -503,7 +504,7 @@ static void *NpyDateTimeScalarToJSON(JSOBJ _obj, JSONTypeContext *tc, // TODO(anyone): Does not appear to be reached in tests. 
pandas_datetime_to_datetimestruct(obj->obval, - (NPY_DATETIMEUNIT)obj->obmeta.base, &dts); + (NPY_DATETIMEUNIT)obj->obmeta.base, &dts); return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen); } @@ -662,9 +663,9 @@ void NpyArr_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { GET_TC(tc)->npyarr = npyarr; if (!npyarr) { - PyErr_NoMemory(); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - return; + PyErr_NoMemory(); + GET_TC(tc)->iterNext = NpyArr_iterNextNone; + return; } npyarr->array = (PyObject *)obj; @@ -675,17 +676,17 @@ void NpyArr_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { npyarr->type_num = PyArray_DESCR(obj)->type_num; if (GET_TC(tc)->transpose) { - npyarr->dim = PyArray_DIM(obj, npyarr->ndim); - npyarr->stride = PyArray_STRIDE(obj, npyarr->ndim); - npyarr->stridedim = npyarr->ndim; - npyarr->index[npyarr->ndim] = 0; - npyarr->inc = -1; + npyarr->dim = PyArray_DIM(obj, npyarr->ndim); + npyarr->stride = PyArray_STRIDE(obj, npyarr->ndim); + npyarr->stridedim = npyarr->ndim; + npyarr->index[npyarr->ndim] = 0; + npyarr->inc = -1; } else { - npyarr->dim = PyArray_DIM(obj, 0); - npyarr->stride = PyArray_STRIDE(obj, 0); - npyarr->stridedim = 0; - npyarr->index[0] = 0; - npyarr->inc = 1; + npyarr->dim = PyArray_DIM(obj, 0); + npyarr->stride = PyArray_STRIDE(obj, 0); + npyarr->stridedim = 0; + npyarr->index[0] = 0; + npyarr->inc = 1; } npyarr->columnLabels = GET_TC(tc)->columnLabels; @@ -733,8 +734,7 @@ int NpyArr_iterNextItem(JSOBJ obj, JSONTypeContext *tc) { NpyArr_freeItemValue(obj, tc); - if (PyArray_ISDATETIME(npyarr->array)) - { + if (PyArray_ISDATETIME(npyarr->array)) { PRINTMARK(); GET_TC(tc)->itemValue = obj; Py_INCREF(obj); @@ -787,30 +787,23 @@ JSOBJ NpyArr_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static void NpyArr_getLabel(JSOBJ obj, JSONTypeContext *tc, size_t *outLen, - npy_intp idx, char **labels) { - JSONObjectEncoder *enc = (JSONObjectEncoder *)tc->encoder; - PRINTMARK(); - *outLen = strlen(labels[idx]); - Buffer_Reserve(enc, *outLen); - memcpy(enc->offset, labels[idx], sizeof(char) * (*outLen)); - enc->offset += *outLen; - *outLen = 0; -} - char *NpyArr_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { NpyArrContext *npyarr = GET_TC(tc)->npyarr; npy_intp idx; PRINTMARK(); + char *cStr; if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) { idx = npyarr->index[npyarr->stridedim] - 1; - NpyArr_getLabel(obj, tc, outLen, idx, npyarr->columnLabels); + cStr = npyarr->columnLabels[idx]; } else { idx = npyarr->index[npyarr->stridedim - npyarr->inc] - 1; - NpyArr_getLabel(obj, tc, outLen, idx, npyarr->rowLabels); + cStr = npyarr->rowLabels[idx]; } - return NULL; + + *outLen = strlen(cStr); + + return cStr; } //============================================================================= @@ -852,19 +845,22 @@ char *PdBlock_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; NpyArrContext *npyarr = blkCtxt->npyCtxts[0]; npy_intp idx; + char *cStr; PRINTMARK(); if (GET_TC(tc)->iterNext == PdBlock_iterNextItem) { idx = blkCtxt->colIdx - 1; - NpyArr_getLabel(obj, tc, outLen, idx, npyarr->columnLabels); + cStr = npyarr->columnLabels[idx]; } else { idx = GET_TC(tc)->iterNext != PdBlock_iterNext ? 
npyarr->index[npyarr->stridedim - npyarr->inc] - 1 : npyarr->index[npyarr->stridedim]; - NpyArr_getLabel(obj, tc, outLen, idx, npyarr->rowLabels); + cStr = npyarr->rowLabels[idx]; } - return NULL; + + *outLen = strlen(cStr); + return cStr; } char *PdBlock_iterGetName_Transpose(JSOBJ obj, JSONTypeContext *tc, @@ -872,16 +868,19 @@ char *PdBlock_iterGetName_Transpose(JSOBJ obj, JSONTypeContext *tc, PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; NpyArrContext *npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx]; npy_intp idx; + char *cStr; PRINTMARK(); if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) { idx = npyarr->index[npyarr->stridedim] - 1; - NpyArr_getLabel(obj, tc, outLen, idx, npyarr->columnLabels); + cStr = npyarr->columnLabels[idx]; } else { idx = blkCtxt->colIdx; - NpyArr_getLabel(obj, tc, outLen, idx, npyarr->rowLabels); + cStr = npyarr->rowLabels[idx]; } - return NULL; + + *outLen = strlen(cStr); + return cStr; } int PdBlock_iterNext(JSOBJ obj, JSONTypeContext *tc) { @@ -942,9 +941,9 @@ void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { dtype = PyArray_DescrFromType(NPY_INT64); obj = (PyObject *)_obj; - GET_TC(tc) - ->iterGetName = GET_TC(tc)->transpose ? PdBlock_iterGetName_Transpose - : PdBlock_iterGetName; + GET_TC(tc)->iterGetName = GET_TC(tc)->transpose + ? PdBlock_iterGetName_Transpose + : PdBlock_iterGetName; blkCtxt = PyObject_Malloc(sizeof(PdBlockContext)); if (!blkCtxt) { @@ -1395,7 +1394,7 @@ void Series_iterBegin(JSOBJ obj, JSONTypeContext *tc) { PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; GET_TC(tc)->index = 0; GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); - enc->outputFormat = VALUES; // for contained series + enc->outputFormat = VALUES; // for contained series if (!GET_TC(tc)->cStr) { PyErr_NoMemory(); } @@ -1454,7 +1453,7 @@ void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) { PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; GET_TC(tc)->index = 0; GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); - enc->outputFormat = VALUES; // for contained series & index + enc->outputFormat = VALUES; // for contained series & index if (!GET_TC(tc)->cStr) { PyErr_NoMemory(); } @@ -1578,16 +1577,30 @@ void NpyArr_freeLabels(char **labels, npy_intp len) { } } -char **NpyArr_encodeLabels(PyArrayObject *labels, JSONObjectEncoder *enc, +/* + * Function: NpyArr_encodeLabels + * ----------------------------- + * + * Builds an array of "encoded" labels. + * + * labels: PyArrayObject pointer for labels to be "encoded" + * num : number of labels + * + * "encode" is quoted above because we aren't really doing encoding + * For historical reasons this function would actually encode the entire + * array into a separate buffer with a separate call to JSON_Encode + * and would leave it to complex pointer manipulation from there to + * unpack values as needed. To make things simpler and more idiomatic + * this has instead just stringified any input save for datetime values, + * which may need to be represented in various formats. + */ +char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, npy_intp num) { // NOTE this function steals a reference to labels. 
- PyObjectEncoder *pyenc = (PyObjectEncoder *)enc; PyObject *item = NULL; - npy_intp i, stride, len, need_quotes; + npy_intp i, stride, len; char **ret; - char *dataptr, *cLabel, *origend, *origst, *origoffset; - char labelBuffer[NPY_JSON_BUFSIZE]; - PyArray_GetItemFunc *getitem; + char *dataptr, *cLabel; int type_num; PRINTMARK(); @@ -1614,68 +1627,137 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, JSONObjectEncoder *enc, ret[i] = NULL; } - origst = enc->start; - origend = enc->end; - origoffset = enc->offset; - stride = PyArray_STRIDE(labels, 0); dataptr = PyArray_DATA(labels); - getitem = (PyArray_GetItemFunc *)PyArray_DESCR(labels)->f->getitem; type_num = PyArray_TYPE(labels); for (i = 0; i < num; i++) { - if (PyTypeNum_ISDATETIME(type_num) || PyTypeNum_ISNUMBER(type_num)) - { - item = (PyObject *)labels; - pyenc->npyType = type_num; - pyenc->npyValue = dataptr; - } else { - item = getitem(dataptr, labels); - if (!item) { + item = PyArray_GETITEM(labels, dataptr); + if (!item) { + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + // TODO: for any matches on type_num (date and timedeltas) should use a + // vectorized solution to convert to epoch or iso formats + if (enc->datetimeIso && + (type_num == NPY_TIMEDELTA || PyDelta_Check(item))) { + PyObject *td = PyObject_CallFunction(cls_timedelta, "(O)", item); + if (td == NULL) { + Py_DECREF(item); NpyArr_freeLabels(ret, num); ret = 0; break; } - } - cLabel = JSON_EncodeObject(item, enc, labelBuffer, NPY_JSON_BUFSIZE); + PyObject *iso = PyObject_CallMethod(td, "isoformat", NULL); + Py_DECREF(td); + if (iso == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } - if (item != (PyObject *)labels) { - Py_DECREF(item); + cLabel = (char *)PyUnicode_AsUTF8(iso); + Py_DECREF(iso); + len = strlen(cLabel); + } else if (PyTypeNum_ISDATETIME(type_num) || PyDateTime_Check(item) || + PyDate_Check(item)) { + PyObject *ts = PyObject_CallFunction(cls_timestamp, "(O)", item); + if (ts == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + if (enc->datetimeIso) { + PyObject *iso = PyObject_CallMethod(ts, "isoformat", NULL); + Py_DECREF(ts); + if (iso == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + cLabel = (char *)PyUnicode_AsUTF8(iso); + Py_DECREF(iso); + len = strlen(cLabel); + } else { + npy_int64 value; + // TODO: refactor to not duplicate what goes on in + // beginTypeContext + if (PyObject_HasAttrString(ts, "value")) { + PRINTMARK(); + value = get_long_attr(ts, "value"); + } else { + PRINTMARK(); + value = total_seconds(ts) * + 1000000000LL; // nanoseconds per second + } + Py_DECREF(ts); + + switch (enc->datetimeUnit) { + case NPY_FR_ns: + break; + case NPY_FR_us: + value /= 1000LL; + break; + case NPY_FR_ms: + value /= 1000000LL; + break; + case NPY_FR_s: + value /= 1000000000LL; + break; + default: + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + char buf[21] = {0}; // 21 chars for 2**63 as string + cLabel = buf; + sprintf(buf, "%" NPY_INT64_FMT, value); + len = strlen(cLabel); + } + } else { // Fallack to string representation + PyObject *str = PyObject_Str(item); + if (str == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + cLabel = (char *)PyUnicode_AsUTF8(str); + Py_DECREF(str); + len = strlen(cLabel); } - if (PyErr_Occurred() || enc->errorMsg) { + Py_DECREF(item); + // Add 1 to include NULL terminator + ret[i] = PyObject_Malloc(len + 1); + memcpy(ret[i], cLabel, 
len + 1); + + if (PyErr_Occurred()) { NpyArr_freeLabels(ret, num); ret = 0; break; } - need_quotes = ((*cLabel) != '"'); - len = enc->offset - cLabel + 1 + 2 * need_quotes; - ret[i] = PyObject_Malloc(sizeof(char) * len); - if (!ret[i]) { PyErr_NoMemory(); ret = 0; break; } - if (need_quotes) { - ret[i][0] = '"'; - memcpy(ret[i] + 1, cLabel, sizeof(char) * (len - 4)); - ret[i][len - 3] = '"'; - } else { - memcpy(ret[i], cLabel, sizeof(char) * (len - 2)); - } - ret[i][len - 2] = ':'; - ret[i][len - 1] = '\0'; dataptr += stride; } - enc->start = origst; - enc->end = origend; - enc->offset = origoffset; - Py_DECREF(labels); return ret; } @@ -1840,23 +1922,22 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { value = get_long_attr(obj, "value"); } else { PRINTMARK(); - value = - total_seconds(obj) * 1000000000LL; // nanoseconds per second + value = total_seconds(obj) * 1000000000LL; // nanoseconds per second } base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; switch (base) { - case NPY_FR_ns: - break; - case NPY_FR_us: - value /= 1000LL; - break; - case NPY_FR_ms: - value /= 1000000LL; - break; - case NPY_FR_s: - value /= 1000000000LL; - break; + case NPY_FR_ns: + break; + case NPY_FR_us: + value /= 1000LL; + break; + case NPY_FR_ms: + value /= 1000000LL; + break; + case NPY_FR_s: + value /= 1000000000LL; + break; } exc = PyErr_Occurred(); @@ -1971,8 +2052,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { goto INVALID; } pc->columnLabelsLen = PyArray_DIM(pc->newObj, 0); - pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, - (JSONObjectEncoder *)enc, + pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc, pc->columnLabelsLen); if (!pc->columnLabels) { goto INVALID; @@ -2074,8 +2154,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { goto INVALID; } pc->columnLabelsLen = PyObject_Size(tmpObj); - pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, - (JSONObjectEncoder *)enc, + pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc, pc->columnLabelsLen); Py_DECREF(tmpObj); if (!pc->columnLabels) { @@ -2096,9 +2175,8 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { goto INVALID; } pc->rowLabelsLen = PyObject_Size(tmpObj); - pc->rowLabels = - NpyArr_encodeLabels((PyArrayObject *)values, - (JSONObjectEncoder *)enc, pc->rowLabelsLen); + pc->rowLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc, + pc->rowLabelsLen); Py_DECREF(tmpObj); tmpObj = (enc->outputFormat == INDEX ? 
PyObject_GetAttrString(obj, "columns") @@ -2116,8 +2194,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { goto INVALID; } pc->columnLabelsLen = PyObject_Size(tmpObj); - pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, - (JSONObjectEncoder *)enc, + pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc, pc->columnLabelsLen); Py_DECREF(tmpObj); if (!pc->columnLabels) { @@ -2242,7 +2319,8 @@ void Object_endTypeContext(JSOBJ obj, JSONTypeContext *tc) { PyObject_Free(GET_TC(tc)->cStr); GET_TC(tc)->cStr = NULL; - if (tc->prv != &(((PyObjectEncoder *)tc->encoder)->basicTypeContext)) { // NOLINT + if (tc->prv != + &(((PyObjectEncoder *)tc->encoder)->basicTypeContext)) { // NOLINT PyObject_Free(tc->prv); } tc->prv = NULL; @@ -2305,7 +2383,7 @@ PyObject *objToJSON(PyObject *self, PyObject *args, PyObject *kwargs) { PyObject *newobj; PyObject *oinput = NULL; PyObject *oensureAscii = NULL; - int idoublePrecision = 10; // default double precision setting + int idoublePrecision = 10; // default double precision setting PyObject *oencodeHTMLChars = NULL; char *sOrient = NULL; char *sdateFormat = NULL; @@ -2328,10 +2406,10 @@ PyObject *objToJSON(PyObject *self, PyObject *args, PyObject *kwargs) { PyObject_Malloc, PyObject_Realloc, PyObject_Free, - -1, // recursionMax + -1, // recursionMax idoublePrecision, - 1, // forceAscii - 0, // encodeHTMLChars + 1, // forceAscii + 0, // encodeHTMLChars }}; JSONObjectEncoder *encoder = (JSONObjectEncoder *)&pyEncoder; @@ -2429,7 +2507,6 @@ PyObject *objToJSON(PyObject *self, PyObject *args, PyObject *kwargs) { PRINTMARK(); ret = JSON_EncodeObject(oinput, encoder, buffer, sizeof(buffer)); PRINTMARK(); - if (PyErr_Occurred()) { PRINTMARK(); return NULL; diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 4e49f660f5e19..01e500a80dcc4 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -71,7 +71,7 @@ cdef inline object create_time_from_ts( @cython.wraparound(False) @cython.boundscheck(False) -def ints_to_pydatetime(int64_t[:] arr, object tz=None, object freq=None, +def ints_to_pydatetime(const int64_t[:] arr, object tz=None, object freq=None, str box="datetime"): """ Convert an i8 repr to an ndarray of datetimes, date, time or Timestamp diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 6fab1b5c02be1..020d1acf0b4ce 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -92,6 +92,9 @@ cdef class _NaT(datetime): # int64_t value # object freq + # higher than np.ndarray and np.matrix + __array_priority__ = 100 + def __hash__(_NaT self): # py3k needs this defined here return hash(self.value) @@ -103,61 +106,102 @@ cdef class _NaT(datetime): if ndim == -1: return _nat_scalar_rules[op] - if ndim == 0: + elif util.is_array(other): + result = np.empty(other.shape, dtype=np.bool_) + result.fill(_nat_scalar_rules[op]) + return result + + elif ndim == 0: if is_datetime64_object(other): return _nat_scalar_rules[op] else: raise TypeError('Cannot compare type %r with type %r' % (type(self).__name__, type(other).__name__)) + # Note: instead of passing "other, self, _reverse_ops[op]", we observe # that `_nat_scalar_rules` is invariant under `_reverse_ops`, # rendering it unnecessary. 
return PyObject_RichCompare(other, self, op) def __add__(self, other): + if self is not c_NaT: + # cython __radd__ semantics + self, other = other, self + if PyDateTime_Check(other): return c_NaT - + elif PyDelta_Check(other): + return c_NaT + elif is_datetime64_object(other) or is_timedelta64_object(other): + return c_NaT elif hasattr(other, 'delta'): # Timedelta, offsets.Tick, offsets.Week return c_NaT - elif getattr(other, '_typ', None) in ['dateoffset', 'series', - 'period', 'datetimeindex', - 'datetimearray', - 'timedeltaindex', - 'timedeltaarray']: - # Duplicate logic in _Timestamp.__add__ to avoid needing - # to subclass; allows us to @final(_Timestamp.__add__) - return NotImplemented - return c_NaT + + elif is_integer_object(other) or util.is_period_object(other): + # For Period compat + # TODO: the integer behavior is deprecated, remove it + return c_NaT + + elif util.is_array(other): + if other.dtype.kind in 'mM': + # If we are adding to datetime64, we treat NaT as timedelta + # Either way, result dtype is datetime64 + result = np.empty(other.shape, dtype="datetime64[ns]") + result.fill("NaT") + return result + + return NotImplemented def __sub__(self, other): # Duplicate some logic from _Timestamp.__sub__ to avoid needing # to subclass; allows us to @final(_Timestamp.__sub__) + cdef: + bint is_rsub = False + + if self is not c_NaT: + # cython __rsub__ semantics + self, other = other, self + is_rsub = True + if PyDateTime_Check(other): - return NaT + return c_NaT elif PyDelta_Check(other): - return NaT + return c_NaT + elif is_datetime64_object(other) or is_timedelta64_object(other): + return c_NaT + elif hasattr(other, 'delta'): + # offsets.Tick, offsets.Week + return c_NaT - elif getattr(other, '_typ', None) == 'datetimeindex': - # a Timestamp-DatetimeIndex -> yields a negative TimedeltaIndex - return -other.__sub__(self) + elif is_integer_object(other) or util.is_period_object(other): + # For Period compat + # TODO: the integer behavior is deprecated, remove it + return c_NaT - elif getattr(other, '_typ', None) == 'timedeltaindex': - # a Timestamp-TimedeltaIndex -> yields a negative TimedeltaIndex - return (-other).__add__(self) + elif util.is_array(other): + if other.dtype.kind == 'm': + if not is_rsub: + # NaT - timedelta64 we treat NaT as datetime64, so result + # is datetime64 + result = np.empty(other.shape, dtype="datetime64[ns]") + result.fill("NaT") + return result + + # timedelta64 - NaT we have to treat NaT as timedelta64 + # for this to be meaningful, and the result is timedelta64 + result = np.empty(other.shape, dtype="timedelta64[ns]") + result.fill("NaT") + return result + + elif other.dtype.kind == 'M': + # We treat NaT as a datetime, so regardless of whether this is + # NaT - other or other - NaT, the result is timedelta64 + result = np.empty(other.shape, dtype="timedelta64[ns]") + result.fill("NaT") + return result - elif hasattr(other, 'delta'): - # offsets.Tick, offsets.Week - neg_other = -other - return self + neg_other - - elif getattr(other, '_typ', None) in ['period', 'series', - 'periodindex', 'dateoffset', - 'datetimearray', - 'timedeltaarray']: - return NotImplemented - return NaT + return NotImplemented def __pos__(self): return NaT diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index c68d686ff2bf2..98e55f50062a2 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -21,7 +21,8 @@ PyDateTime_IMPORT from pandas._libs.tslibs.np_datetime cimport ( npy_datetimestruct, dtstruct_to_dt64, 
dt64_to_dtstruct, - pandas_datetime_to_datetimestruct, NPY_DATETIMEUNIT, NPY_FR_D) + pandas_datetime_to_datetimestruct, check_dts_bounds, + NPY_DATETIMEUNIT, NPY_FR_D) cdef extern from "src/datetime/np_datetime.h": int64_t npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT fr, @@ -1011,7 +1012,7 @@ def dt64arr_to_periodarr(int64_t[:] dtarr, int freq, tz=None): @cython.wraparound(False) @cython.boundscheck(False) -def periodarr_to_dt64arr(int64_t[:] periodarr, int freq): +def periodarr_to_dt64arr(const int64_t[:] periodarr, int freq): """ Convert array to datetime64 values from a set of ordinals corresponding to periods per period convention. @@ -1024,9 +1025,8 @@ def periodarr_to_dt64arr(int64_t[:] periodarr, int freq): out = np.empty(l, dtype='i8') - with nogil: - for i in range(l): - out[i] = period_ordinal_to_dt64(periodarr[i], freq) + for i in range(l): + out[i] = period_ordinal_to_dt64(periodarr[i], freq) return out.base # .base to access underlying np.ndarray @@ -1179,7 +1179,7 @@ cpdef int64_t period_ordinal(int y, int m, int d, int h, int min, return get_period_ordinal(&dts, freq) -cpdef int64_t period_ordinal_to_dt64(int64_t ordinal, int freq) nogil: +cdef int64_t period_ordinal_to_dt64(int64_t ordinal, int freq) except? -1: cdef: npy_datetimestruct dts @@ -1187,6 +1187,7 @@ cpdef int64_t period_ordinal_to_dt64(int64_t ordinal, int freq) nogil: return NPY_NAT get_date_info(ordinal, freq, &dts) + check_dts_bounds(&dts) return dtstruct_to_dt64(&dts) diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index d93858cff5e05..fbda5f178e164 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -341,7 +341,8 @@ def array_strptime(object[:] values, object fmt, return result, result_timezone.base -"""_getlang, LocaleTime, TimeRE, _calc_julian_from_U_or_W are vendored +""" +_getlang, LocaleTime, TimeRE, _calc_julian_from_U_or_W are vendored from the standard library, see https://github.com/python/cpython/blob/master/Lib/_strptime.py The original module-level docstring follows. @@ -363,7 +364,8 @@ def _getlang(): class LocaleTime: - """Stores and handles locale-specific information related to time. + """ + Stores and handles locale-specific information related to time. ATTRIBUTES: f_weekday -- full weekday names (7-item list) @@ -382,7 +384,8 @@ class LocaleTime: """ def __init__(self): - """Set all attributes. + """ + Set all attributes. Order of methods called matters for dependency reasons. @@ -399,7 +402,6 @@ class LocaleTime: Only other possible issue is if someone changed the timezone and did not call tz.tzset . That is an issue for the programmer, though, since changing the timezone is worthless without that call. - """ self.lang = _getlang() self.__calc_weekday() @@ -518,15 +520,16 @@ class TimeRE(dict): """ def __init__(self, locale_time=None): - """Create keys/values. + """ + Create keys/values. Order of execution is important for dependency reasons. 
- """ if locale_time: self.locale_time = locale_time else: self.locale_time = LocaleTime() + self._Z = None base = super() base.__init__({ # The " \d" part of the regex is to make %c from ANSI C work @@ -555,21 +558,29 @@ class TimeRE(dict): 'B': self.__seqToRE(self.locale_time.f_month[1:], 'B'), 'b': self.__seqToRE(self.locale_time.a_month[1:], 'b'), 'p': self.__seqToRE(self.locale_time.am_pm, 'p'), - 'Z': self.__seqToRE(pytz.all_timezones, 'Z'), + # 'Z' key is generated lazily via __getitem__ '%': '%'}) base.__setitem__('W', base.__getitem__('U').replace('U', 'W')) base.__setitem__('c', self.pattern(self.locale_time.LC_date_time)) base.__setitem__('x', self.pattern(self.locale_time.LC_date)) base.__setitem__('X', self.pattern(self.locale_time.LC_time)) + def __getitem__(self, key): + if key == "Z": + # lazy computation + if self._Z is None: + self._Z = self.__seqToRE(pytz.all_timezones, 'Z') + return self._Z + return super().__getitem__(key) + def __seqToRE(self, to_convert, directive): - """Convert a list to a regex string for matching a directive. + """ + Convert a list to a regex string for matching a directive. Want possible matching values to be from longest to shortest. This prevents the possibility of a match occurring for a value that also a substring of a larger value that should have matched (e.g., 'abc' matching when 'abcdef' should have been the match). - """ to_convert = sorted(to_convert, key=len, reverse=True) for value in to_convert: @@ -582,11 +593,11 @@ class TimeRE(dict): return '%s)' % regex def pattern(self, format): - """Return regex pattern for the format string. + """ + Return regex pattern for the format string. Need to make sure that any characters that might be interpreted as regex syntax are escaped. - """ processed_format = '' # The sub() call escapes all characters that might be misconstrued @@ -619,7 +630,8 @@ _regex_cache = {} cdef int _calc_julian_from_U_or_W(int year, int week_of_year, int day_of_week, int week_starts_Mon): - """Calculate the Julian day based on the year, week of the year, and day of + """ + Calculate the Julian day based on the year, week of the year, and day of the week, with week_start_day representing whether the week of the year assumes the week starts on Sunday or Monday (6 or 0). @@ -660,8 +672,10 @@ cdef int _calc_julian_from_U_or_W(int year, int week_of_year, return 1 + days_to_week + day_of_week -cdef object _calc_julian_from_V(int iso_year, int iso_week, int iso_weekday): - """Calculate the Julian day based on the ISO 8601 year, week, and weekday. +cdef (int, int) _calc_julian_from_V(int iso_year, int iso_week, int iso_weekday): + """ + Calculate the Julian day based on the ISO 8601 year, week, and weekday. + ISO weeks start on Mondays, with week 01 being the week containing 4 Jan. ISO week days range from 1 (Monday) to 7 (Sunday). 
@@ -694,7 +708,7 @@ cdef object _calc_julian_from_V(int iso_year, int iso_week, int iso_weekday): return iso_year, ordinal -cdef parse_timezone_directive(object z): +cdef parse_timezone_directive(str z): """ Parse the '%z' directive and return a pytz.FixedOffset diff --git a/pandas/_typing.py b/pandas/_typing.py index 837a7a89e0b83..37a5d7945955d 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -11,9 +11,9 @@ from pandas.core.arrays.base import ExtensionArray # noqa: F401 from pandas.core.dtypes.dtypes import ExtensionDtype # noqa: F401 from pandas.core.indexes.base import Index # noqa: F401 - from pandas.core.frame import DataFrame # noqa: F401 from pandas.core.series import Series # noqa: F401 from pandas.core.sparse.series import SparseSeries # noqa: F401 + from pandas.core.generic import NDFrame # noqa: F401 AnyArrayLike = TypeVar( @@ -24,7 +24,10 @@ Dtype = Union[str, np.dtype, "ExtensionDtype"] FilePathOrBuffer = Union[str, Path, IO[AnyStr]] -FrameOrSeries = TypeVar("FrameOrSeries", "Series", "DataFrame") +FrameOrSeries = TypeVar("FrameOrSeries", bound="NDFrame") Scalar = Union[str, int, float] Axis = Union[str, int] Ordered = Optional[bool] + +# to maintain type information across generic functions and parametrization +_T = TypeVar("_T") diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 5ecd641fc68be..9c778f68727c6 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -10,10 +10,12 @@ import platform import struct import sys +import warnings PY35 = sys.version_info[:2] == (3, 5) PY36 = sys.version_info >= (3, 6) PY37 = sys.version_info >= (3, 7) +PY38 = sys.version_info >= (3, 8) PYPY = platform.python_implementation() == "PyPy" @@ -65,3 +67,32 @@ def is_platform_mac(): def is_platform_32bit(): return struct.calcsize("P") * 8 < 64 + + +def _import_lzma(): + """Attempts to import lzma, warning the user when lzma is not available. + """ + try: + import lzma + + return lzma + except ImportError: + msg = ( + "Could not import the lzma module. " + "Your installed Python is incomplete. " + "Attempting to use lzma compression will result in a RuntimeError." + ) + warnings.warn(msg) + + +def _get_lzma_file(lzma): + """Returns the lzma method LZMAFile when the module was correctly imported. + Otherwise, raises a RuntimeError. + """ + if lzma is None: + raise RuntimeError( + "lzma module not available. " + "A Python re-install with the proper " + "dependencies might be required to solve this issue." 
+ ) + return lzma.LZMAFile diff --git a/pandas/compat/chainmap.py b/pandas/compat/chainmap.py index 83f1da597d6a6..84824207de2a9 100644 --- a/pandas/compat/chainmap.py +++ b/pandas/compat/chainmap.py @@ -15,9 +15,3 @@ def __delitem__(self, key): del mapping[key] return raise KeyError(key) - - # override because the m parameter is introduced in Python 3.4 - def new_child(self, m=None): - if m is None: - m = {} - return self.__class__(m, *self.maps) diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py index bca33513b0069..87240a9f986c3 100644 --- a/pandas/compat/pickle_compat.py +++ b/pandas/compat/pickle_compat.py @@ -196,10 +196,6 @@ def load_newobj_ex(self): def load(fh, encoding=None, is_verbose=False): """load a pickle, with a provided encoding - if compat is True: - fake the old class hierarchy - if it works, then return the new type objects - Parameters ---------- fh : a filelike object diff --git a/pandas/conftest.py b/pandas/conftest.py index 2cf7bf6a6df41..b032e14d8f7e1 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -123,18 +123,22 @@ def ip(): @pytest.fixture(params=[True, False, None]) def observed(request): - """ pass in the observed keyword to groupby for [True, False] + """ + Pass in the observed keyword to groupby for [True, False] This indicates whether categoricals should return values for values which are not in the grouper [False / None], or only values which appear in the grouper [True]. [None] is supported for future compatibility if we decide to change the default (and would need to warn if this - parameter is not passed)""" + parameter is not passed). + """ return request.param @pytest.fixture(params=[True, False, None]) def ordered_fixture(request): - """Boolean 'ordered' parameter for Categorical.""" + """ + Boolean 'ordered' parameter for Categorical. + """ return request.param @@ -234,7 +238,8 @@ def cython_table_items(request): def _get_cython_table_params(ndframe, func_names_and_expected): - """combine frame, functions from SelectionMixin._cython_table + """ + Combine frame, functions from SelectionMixin._cython_table keys and expected result. Parameters @@ -242,7 +247,7 @@ def _get_cython_table_params(ndframe, func_names_and_expected): ndframe : DataFrame or Series func_names_and_expected : Sequence of two items The first item is a name of a NDFrame method ('sum', 'prod') etc. - The second item is the expected return value + The second item is the expected return value. Returns ------- @@ -341,7 +346,8 @@ def strict_data_files(pytestconfig): @pytest.fixture def datapath(strict_data_files): - """Get the path to a data file. + """ + Get the path to a data file. Parameters ---------- @@ -375,7 +381,9 @@ def deco(*args): @pytest.fixture def iris(datapath): - """The iris dataset as a DataFrame.""" + """ + The iris dataset as a DataFrame. + """ return pd.read_csv(datapath("data", "iris.csv")) @@ -504,7 +512,8 @@ def tz_aware_fixture(request): @pytest.fixture(params=STRING_DTYPES) def string_dtype(request): - """Parametrized fixture for string dtypes. + """ + Parametrized fixture for string dtypes. * str * 'str' @@ -515,7 +524,8 @@ def string_dtype(request): @pytest.fixture(params=BYTES_DTYPES) def bytes_dtype(request): - """Parametrized fixture for bytes dtypes. + """ + Parametrized fixture for bytes dtypes. * bytes * 'bytes' @@ -525,7 +535,8 @@ def bytes_dtype(request): @pytest.fixture(params=OBJECT_DTYPES) def object_dtype(request): - """Parametrized fixture for object dtypes. + """ + Parametrized fixture for object dtypes. 
* object * 'object' @@ -535,7 +546,8 @@ def object_dtype(request): @pytest.fixture(params=DATETIME64_DTYPES) def datetime64_dtype(request): - """Parametrized fixture for datetime64 dtypes. + """ + Parametrized fixture for datetime64 dtypes. * 'datetime64[ns]' * 'M8[ns]' @@ -545,7 +557,8 @@ def datetime64_dtype(request): @pytest.fixture(params=TIMEDELTA64_DTYPES) def timedelta64_dtype(request): - """Parametrized fixture for timedelta64 dtypes. + """ + Parametrized fixture for timedelta64 dtypes. * 'timedelta64[ns]' * 'm8[ns]' diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 21d12d02c9008..1132f7d6ffdfd 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -28,13 +28,11 @@ is_complex_dtype, is_datetime64_any_dtype, is_datetime64_ns_dtype, - is_datetime64tz_dtype, is_datetimelike, is_extension_array_dtype, is_float_dtype, is_integer, is_integer_dtype, - is_interval_dtype, is_list_like, is_numeric_dtype, is_object_dtype, @@ -183,8 +181,6 @@ def _reconstruct_data(values, dtype, original): if is_extension_array_dtype(dtype): values = dtype.construct_array_type()._from_sequence(values) - elif is_datetime64tz_dtype(dtype) or is_period_dtype(dtype): - values = Index(original)._shallow_copy(values, name=None) elif is_bool_dtype(dtype): values = values.astype(dtype) @@ -1645,19 +1641,13 @@ def take_nd( May be the same type as the input, or cast to an ndarray. """ - # TODO(EA): Remove these if / elifs as datetimeTZ, interval, become EAs - # dispatch to internal type takes if is_extension_array_dtype(arr): return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) - elif is_datetime64tz_dtype(arr): - return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) - elif is_interval_dtype(arr): - return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) if is_sparse(arr): arr = arr.to_dense() elif isinstance(arr, (ABCIndexClass, ABCSeries)): - arr = arr.values + arr = arr._values arr = np.asarray(arr) @@ -1925,6 +1915,7 @@ def diff(arr, n, axis=0): dtype = arr.dtype is_timedelta = False + is_bool = False if needs_i8_conversion(arr): dtype = np.float64 arr = arr.view("i8") @@ -1933,6 +1924,7 @@ def diff(arr, n, axis=0): elif is_bool_dtype(dtype): dtype = np.object_ + is_bool = True elif is_integer_dtype(dtype): dtype = np.float64 @@ -1972,6 +1964,8 @@ def diff(arr, n, axis=0): result = res - lag result[mask] = na out_arr[res_indexer] = result + elif is_bool: + out_arr[res_indexer] = arr[res_indexer] ^ arr[lag_indexer] else: out_arr[res_indexer] = arr[res_indexer] - arr[lag_indexer] diff --git a/pandas/core/api.py b/pandas/core/api.py index 73323d93b8215..bd2a57a15bdd2 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -2,6 +2,16 @@ import numpy as np +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, + DatetimeTZDtype, + IntervalDtype, + PeriodDtype, +) +from pandas.core.dtypes.missing import isna, isnull, notna, notnull + +from pandas.core.algorithms import factorize, unique, value_counts +from pandas.core.arrays import Categorical from pandas.core.arrays.integer import ( Int8Dtype, Int16Dtype, @@ -12,45 +22,38 @@ UInt32Dtype, UInt64Dtype, ) -from pandas.core.algorithms import factorize, unique, value_counts -from pandas.core.dtypes.missing import isna, isnull, notna, notnull -from pandas.core.dtypes.dtypes import ( - CategoricalDtype, - PeriodDtype, - IntervalDtype, - DatetimeTZDtype, -) -from pandas.core.arrays import Categorical from pandas.core.construction import array + from pandas.core.groupby import 
Grouper, NamedAgg -from pandas.io.formats.format import set_eng_float_format + +# DataFrame needs to be imported after NamedAgg to avoid a circular import +from pandas.core.frame import DataFrame # isort:skip from pandas.core.index import ( - Index, CategoricalIndex, - Int64Index, - UInt64Index, - RangeIndex, + DatetimeIndex, Float64Index, - MultiIndex, + Index, + Int64Index, IntervalIndex, - TimedeltaIndex, - DatetimeIndex, - PeriodIndex, + MultiIndex, NaT, + PeriodIndex, + RangeIndex, + TimedeltaIndex, + UInt64Index, ) +from pandas.core.indexes.datetimes import Timestamp, bdate_range, date_range +from pandas.core.indexes.interval import Interval, interval_range from pandas.core.indexes.period import Period, period_range from pandas.core.indexes.timedeltas import Timedelta, timedelta_range -from pandas.core.indexes.datetimes import Timestamp, date_range, bdate_range -from pandas.core.indexes.interval import Interval, interval_range - -from pandas.core.series import Series -from pandas.core.frame import DataFrame - -# TODO: Remove import when statsmodels updates #18264 -from pandas.core.reshape.reshape import get_dummies - from pandas.core.indexing import IndexSlice -from pandas.core.tools.numeric import to_numeric -from pandas.tseries.offsets import DateOffset +from pandas.core.reshape.reshape import ( + get_dummies, +) # TODO: Remove get_dummies import when statsmodels updates #18264 +from pandas.core.series import Series from pandas.core.tools.datetimes import to_datetime +from pandas.core.tools.numeric import to_numeric from pandas.core.tools.timedeltas import to_timedelta + +from pandas.io.formats.format import set_eng_float_format +from pandas.tseries.offsets import DateOffset diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 5c8599dbb054b..b96b3c7572031 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -3,7 +3,7 @@ import numpy as np -from pandas._libs import reduction +from pandas._libs import reduction as libreduction from pandas.util._decorators import cache_readonly from pandas.core.dtypes.common import ( @@ -221,7 +221,7 @@ def apply_raw(self): """ apply to the values as a numpy array """ try: - result = reduction.compute_reduction(self.values, self.f, axis=self.axis) + result = libreduction.compute_reduction(self.values, self.f, axis=self.axis) except Exception: result = np.apply_along_axis(self.f, self.axis, self.values) @@ -281,7 +281,7 @@ def apply_standard(self): dummy = Series(empty_arr, index=index, dtype=values.dtype) try: - result = reduction.compute_reduction( + result = libreduction.compute_reduction( values, self.f, axis=self.axis, dummy=dummy, labels=labels ) return self.obj._constructor_sliced(result, index=labels) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index e517be4f03a16..0778b6726d104 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -64,9 +64,9 @@ class ExtensionArray: shift take unique + view _concat_same_type _formatter - _formatting_values _from_factorized _from_sequence _from_sequence_of_strings @@ -147,7 +147,7 @@ class ExtensionArray: If implementing NumPy's ``__array_ufunc__`` interface, pandas expects that - 1. You defer by raising ``NotImplemented`` when any Series are present + 1. You defer by returning ``NotImplemented`` when any Series are present in `inputs`. Pandas will extract the arrays and call the ufunc again. 2. You define a ``_HANDLED_TYPES`` tuple as an attribute on the class. 
Pandas inspect this to determine whether the ufunc is valid for the @@ -514,7 +514,7 @@ def fillna(self, value=None, method=None, limit=None): def dropna(self): """ - Return ExtensionArray without NA values + Return ExtensionArray without NA values. Returns ------- @@ -862,6 +862,27 @@ def copy(self) -> ABCExtensionArray: """ raise AbstractMethodError(self) + def view(self, dtype=None) -> Union[ABCExtensionArray, np.ndarray]: + """ + Return a view on the array. + + Parameters + ---------- + dtype : str, np.dtype, or ExtensionDtype, optional + Default None + + Returns + ------- + ExtensionArray + """ + # NB: + # - This must return a *new* object referencing the same data, not self. + # - The only case that *must* be implemented is with dtype=None, + # giving a view with the same dtype as self. + if dtype is not None: + raise NotImplementedError(dtype) + return self[:] + # ------------------------------------------------------------------------ # Printing # ------------------------------------------------------------------------ @@ -908,21 +929,6 @@ def _formatter(self, boxed: bool = False) -> Callable[[Any], Optional[str]]: return str return repr - def _formatting_values(self) -> np.ndarray: - # At the moment, this has to be an array since we use result.dtype - """ - An array of values to be printed in, e.g. the Series repr - - .. deprecated:: 0.24.0 - - Use :meth:`ExtensionArray._formatter` instead. - - Returns - ------- - array : ndarray - """ - return np.array(self) - # ------------------------------------------------------------------------ # Reshaping # ------------------------------------------------------------------------ @@ -951,7 +957,7 @@ def _concat_same_type( cls, to_concat: Sequence[ABCExtensionArray] ) -> ABCExtensionArray: """ - Concatenate multiple array + Concatenate multiple array. Parameters ---------- diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index d22b4bd4d3f2b..5929a8d51fe43 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1,3 +1,4 @@ +import operator from shutil import get_terminal_size import textwrap from typing import Type, Union, cast @@ -22,7 +23,6 @@ ensure_int64, ensure_object, ensure_platform_int, - is_categorical, is_categorical_dtype, is_datetime64_dtype, is_datetimelike, @@ -79,6 +79,8 @@ def _cat_compare_op(op): + opname = "__{op}__".format(op=op.__name__) + def f(self, other): # On python2, you can usually compare any type to any type, and # Categoricals can be seen as a custom type, but having different @@ -89,9 +91,12 @@ def f(self, other): return NotImplemented other = lib.item_from_zerodim(other) + if is_list_like(other) and len(other) != len(self): + # TODO: Could this fail if the categories are listlike objects? 
+ raise ValueError("Lengths must match.") if not self.ordered: - if op in ["__lt__", "__gt__", "__le__", "__ge__"]: + if opname in ["__lt__", "__gt__", "__le__", "__ge__"]: raise TypeError( "Unordered Categoricals can only compare equality or not" ) @@ -118,7 +123,7 @@ def f(self, other): other_codes = other._codes mask = (self._codes == -1) | (other_codes == -1) - f = getattr(self._codes, op) + f = getattr(self._codes, opname) ret = f(other_codes) if mask.any(): # In other series, the leads to False, so do that here too @@ -128,38 +133,38 @@ def f(self, other): if is_scalar(other): if other in self.categories: i = self.categories.get_loc(other) - ret = getattr(self._codes, op)(i) + ret = getattr(self._codes, opname)(i) # check for NaN in self mask = self._codes == -1 ret[mask] = False return ret else: - if op == "__eq__": + if opname == "__eq__": return np.repeat(False, len(self)) - elif op == "__ne__": + elif opname == "__ne__": return np.repeat(True, len(self)) else: msg = ( "Cannot compare a Categorical for op {op} with a " "scalar, which is not a category." ) - raise TypeError(msg.format(op=op)) + raise TypeError(msg.format(op=opname)) else: # allow categorical vs object dtype array comparisons for equality # these are only positional comparisons - if op in ["__eq__", "__ne__"]: - return getattr(np.array(self), op)(np.array(other)) + if opname in ["__eq__", "__ne__"]: + return getattr(np.array(self), opname)(np.array(other)) msg = ( "Cannot compare a Categorical for op {op} with type {typ}." "\nIf you want to compare values, use 'np.asarray(cat) " " other'." ) - raise TypeError(msg.format(op=op, typ=type(other))) + raise TypeError(msg.format(op=opname, typ=type(other))) - f.__name__ = op + f.__name__ = opname return f @@ -466,7 +471,7 @@ def ordered(self) -> Ordered: @property def dtype(self) -> CategoricalDtype: """ - The :class:`~pandas.api.types.CategoricalDtype` for this instance + The :class:`~pandas.api.types.CategoricalDtype` for this instance. """ return self._dtype @@ -517,19 +522,12 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: return self._set_dtype(dtype) return np.array(self, dtype=dtype, copy=copy) - @cache_readonly - def ndim(self) -> int: - """ - Number of dimensions of the Categorical - """ - return self._codes.ndim - @cache_readonly def size(self) -> int: """ return the len of myself """ - return len(self) + return self._codes.size @cache_readonly def itemsize(self) -> int: @@ -1248,12 +1246,12 @@ def map(self, mapper): new_categories = new_categories.insert(len(new_categories), np.nan) return np.take(new_categories, self._codes) - __eq__ = _cat_compare_op("__eq__") - __ne__ = _cat_compare_op("__ne__") - __lt__ = _cat_compare_op("__lt__") - __gt__ = _cat_compare_op("__gt__") - __le__ = _cat_compare_op("__le__") - __ge__ = _cat_compare_op("__ge__") + __eq__ = _cat_compare_op(operator.eq) + __ne__ = _cat_compare_op(operator.ne) + __lt__ = _cat_compare_op(operator.lt) + __gt__ = _cat_compare_op(operator.gt) + __le__ = _cat_compare_op(operator.le) + __ge__ = _cat_compare_op(operator.ge) # for Series/ndarray like compat @property @@ -1764,18 +1762,10 @@ def ravel(self, order="C"): ) return np.array(self) - def view(self): - """ - Return a view of myself. - - For internal compatibility with numpy arrays. - - Returns - ------- - view : Categorical - Returns `self`! 
- """ - return self + def view(self, dtype=None): + if dtype is not None: + raise NotImplementedError(dtype) + return self._constructor(values=self._codes, dtype=self.dtype, fastpath=True) def to_dense(self): """ @@ -1850,8 +1840,8 @@ def fillna(self, value=None, method=None, limit=None): raise ValueError("fill value must be in categories") values_codes = _get_codes_for_values(value, self.categories) - indexer = np.where(values_codes != -1) - codes[indexer] = values_codes[values_codes != -1] + indexer = np.where(codes == -1) + codes[indexer] = values_codes[indexer] # If value is not a dict or Series it should be a scalar elif is_hashable(value): @@ -2659,18 +2649,18 @@ def _get_codes_for_values(values, categories): return coerce_indexer_dtype(t.lookup(vals), cats) -def _recode_for_categories(codes, old_categories, new_categories): +def _recode_for_categories(codes: np.ndarray, old_categories, new_categories): """ Convert a set of codes for to a new set of categories Parameters ---------- - codes : array + codes : np.ndarray old_categories, new_categories : Index Returns ------- - new_codes : array + new_codes : np.ndarray[np.int64] Examples -------- @@ -2725,17 +2715,15 @@ def _factorize_from_iterable(values): If `values` has a categorical dtype, then `categories` is a CategoricalIndex keeping the categories and order of `values`. """ - from pandas.core.indexes.category import CategoricalIndex - if not is_list_like(values): raise TypeError("Input must be list-like") - if is_categorical(values): - values = CategoricalIndex(values) - # The CategoricalIndex level we want to build has the same categories + if is_categorical_dtype(values): + values = extract_array(values) + # The Categorical we want to build has the same categories # as values but its codes are by def [0, ..., len(n_categories) - 1] cat_codes = np.arange(len(values.categories), dtype=values.codes.dtype) - categories = values._create_from_codes(cat_codes) + categories = Categorical.from_codes(cat_codes, dtype=values.dtype) codes = values.codes else: # The value of ordered is irrelevant since we don't use cat as such, diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 2747b1d7dd9f1..1988726edc79b 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -22,7 +22,6 @@ is_datetime64tz_dtype, is_datetime_or_timedelta_dtype, is_dtype_equal, - is_extension_array_dtype, is_float_dtype, is_integer_dtype, is_list_like, @@ -44,9 +43,10 @@ from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna from pandas._typing import DatetimeLikeScalar -from pandas.core import missing, nanops, ops +from pandas.core import missing, nanops from pandas.core.algorithms import checked_add_with_arr, take, unique1d, value_counts import pandas.core.common as com +from pandas.core.ops.invalid import make_invalid_op from pandas.tseries import frequencies from pandas.tseries.offsets import DateOffset, Tick @@ -160,8 +160,8 @@ def strftime(self, date_format): Returns ------- - Index - Index of formatted strings. + ndarray + NumPy ndarray of formatted strings. 
See Also -------- @@ -179,9 +179,7 @@ def strftime(self, date_format): 'March 10, 2018, 09:00:02 AM'], dtype='object') """ - from pandas import Index - - return Index(self._format_native_types(date_format=date_format)) + return self._format_native_types(date_format=date_format).astype(object) class TimelikeOps: @@ -545,18 +543,8 @@ def astype(self, dtype, copy=True): return np.asarray(self, dtype=dtype) def view(self, dtype=None): - """ - New view on this array with the same data. - - Parameters - ---------- - dtype : numpy dtype, optional - - Returns - ------- - ndarray - With the specified `dtype`. - """ + if dtype is None or dtype is self.dtype: + return type(self)(self._data, dtype=self.dtype) return self._data.view(dtype=dtype) # ------------------------------------------------------------------ @@ -921,18 +909,18 @@ def _is_unique(self): # pow is invalid for all three subclasses; TimedeltaArray will override # the multiplication and division ops - __pow__ = ops.make_invalid_op("__pow__") - __rpow__ = ops.make_invalid_op("__rpow__") - __mul__ = ops.make_invalid_op("__mul__") - __rmul__ = ops.make_invalid_op("__rmul__") - __truediv__ = ops.make_invalid_op("__truediv__") - __rtruediv__ = ops.make_invalid_op("__rtruediv__") - __floordiv__ = ops.make_invalid_op("__floordiv__") - __rfloordiv__ = ops.make_invalid_op("__rfloordiv__") - __mod__ = ops.make_invalid_op("__mod__") - __rmod__ = ops.make_invalid_op("__rmod__") - __divmod__ = ops.make_invalid_op("__divmod__") - __rdivmod__ = ops.make_invalid_op("__rdivmod__") + __pow__ = make_invalid_op("__pow__") + __rpow__ = make_invalid_op("__rpow__") + __mul__ = make_invalid_op("__mul__") + __rmul__ = make_invalid_op("__rmul__") + __truediv__ = make_invalid_op("__truediv__") + __rtruediv__ = make_invalid_op("__rtruediv__") + __floordiv__ = make_invalid_op("__floordiv__") + __rfloordiv__ = make_invalid_op("__rfloordiv__") + __mod__ = make_invalid_op("__mod__") + __rmod__ = make_invalid_op("__rmod__") + __divmod__ = make_invalid_op("__divmod__") + __rdivmod__ = make_invalid_op("__rdivmod__") def _add_datetimelike_scalar(self, other): # Overriden by TimedeltaArray @@ -1017,9 +1005,9 @@ def _add_delta_tdi(self, other): if isinstance(other, np.ndarray): # ndarray[timedelta64]; wrap in TimedeltaIndex for op - from pandas import TimedeltaIndex + from pandas.core.arrays import TimedeltaArray - other = TimedeltaIndex(other) + other = TimedeltaArray._from_sequence(other) self_i8 = self.asi8 other_i8 = other.asi8 @@ -1241,29 +1229,17 @@ def __add__(self, other): if not is_period_dtype(self): maybe_integer_op_deprecated(self) result = self._addsub_int_array(other, operator.add) - elif is_float_dtype(other): - # Explicitly catch invalid dtypes - raise TypeError( - "cannot add {dtype}-dtype to {cls}".format( - dtype=other.dtype, cls=type(self).__name__ - ) - ) - elif is_period_dtype(other): - # if self is a TimedeltaArray and other is a PeriodArray with - # a timedelta-like (i.e. Tick) freq, this operation is valid. - # Defer to the PeriodArray implementation. - # In remaining cases, this will end up raising TypeError. - return NotImplemented - elif is_extension_array_dtype(other): - # Categorical op will raise; defer explicitly - return NotImplemented - else: # pragma: no cover + else: + # Includes Categorical, other ExtensionArrays + # For PeriodDtype, if self is a TimedeltaArray and other is a + # PeriodArray with a timedelta-like (i.e. Tick) freq, this + # operation is valid. Defer to the PeriodArray implementation. 
+ # In remaining cases, this will end up raising TypeError. return NotImplemented if is_timedelta64_dtype(result) and isinstance(result, np.ndarray): from pandas.core.arrays import TimedeltaArray - # TODO: infer freq? return TimedeltaArray(result) return result @@ -1313,29 +1289,13 @@ def __sub__(self, other): if not is_period_dtype(self): maybe_integer_op_deprecated(self) result = self._addsub_int_array(other, operator.sub) - elif isinstance(other, ABCIndexClass): - raise TypeError( - "cannot subtract {cls} and {typ}".format( - cls=type(self).__name__, typ=type(other).__name__ - ) - ) - elif is_float_dtype(other): - # Explicitly catch invalid dtypes - raise TypeError( - "cannot subtract {dtype}-dtype from {cls}".format( - dtype=other.dtype, cls=type(self).__name__ - ) - ) - elif is_extension_array_dtype(other): - # Categorical op will raise; defer explicitly - return NotImplemented - else: # pragma: no cover + else: + # Includes ExtensionArrays, float_dtype return NotImplemented if is_timedelta64_dtype(result) and isinstance(result, np.ndarray): from pandas.core.arrays import TimedeltaArray - # TODO: infer freq? return TimedeltaArray(result) return result diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 061ee4b90d0e9..732f819e743a4 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -53,6 +53,7 @@ from pandas.core.arrays import datetimelike as dtl from pandas.core.arrays._ranges import generate_regular_range import pandas.core.common as com +from pandas.core.ops.invalid import invalid_comparison from pandas.tseries.frequencies import get_period_alias, to_offset from pandas.tseries.offsets import Day, Tick @@ -171,13 +172,13 @@ def wrapper(self, other): other = _to_M8(other, tz=self.tz) except ValueError: # string that cannot be parsed to Timestamp - return ops.invalid_comparison(self, other, op) + return invalid_comparison(self, other, op) result = op(self.asi8, other.view("i8")) if isna(other): result.fill(nat_result) elif lib.is_scalar(other) or np.ndim(other) == 0: - return ops.invalid_comparison(self, other, op) + return invalid_comparison(self, other, op) elif len(other) != len(self): raise ValueError("Lengths must match") else: @@ -191,20 +192,20 @@ def wrapper(self, other): ): # Following Timestamp convention, __eq__ is all-False # and __ne__ is all True, others raise TypeError. - return ops.invalid_comparison(self, other, op) + return invalid_comparison(self, other, op) if is_object_dtype(other): - # We have to use _comp_method_OBJECT_ARRAY instead of numpy + # We have to use comp_method_OBJECT_ARRAY instead of numpy # comparison otherwise it would fail to raise when # comparing tz-aware and tz-naive with np.errstate(all="ignore"): - result = ops._comp_method_OBJECT_ARRAY( + result = ops.comp_method_OBJECT_ARRAY( op, self.astype(object), other ) o_mask = isna(other) elif not (is_datetime64_dtype(other) or is_datetime64tz_dtype(other)): # e.g. is_timedelta64_dtype(other) - return ops.invalid_comparison(self, other, op) + return invalid_comparison(self, other, op) else: self._assert_tzawareness_compat(other) if isinstance(other, (ABCIndexClass, ABCSeries)): @@ -222,8 +223,6 @@ def wrapper(self, other): result = op(self.view("i8"), other.view("i8")) o_mask = other._isnan - result = com.values_from_object(result) - if o_mask.any(): result[o_mask] = nat_result @@ -1064,6 +1063,7 @@ def tz_localize(self, tz, ambiguous="raise", nonexistent="raise", errors=None): Be careful with DST changes. 
When there is sequential data, pandas can infer the DST time: + >>> s = pd.to_datetime(pd.Series(['2018-10-28 01:30:00', ... '2018-10-28 02:00:00', ... '2018-10-28 02:30:00', @@ -1095,6 +1095,7 @@ def tz_localize(self, tz, ambiguous="raise", nonexistent="raise", errors=None): If the DST transition causes nonexistent times, you can shift these dates forward or backwards with a timedelta object or `'shift_forward'` or `'shift_backwards'`. + >>> s = pd.to_datetime(pd.Series(['2015-03-29 02:30:00', ... '2015-03-29 03:30:00'])) >>> s.dt.tz_localize('Europe/Warsaw', nonexistent='shift_forward') @@ -1159,7 +1160,7 @@ def tz_localize(self, tz, ambiguous="raise", nonexistent="raise", errors=None): def to_pydatetime(self): """ Return Datetime Array/Index as object ndarray of datetime.datetime - objects + objects. Returns ------- @@ -1284,7 +1285,7 @@ def to_perioddelta(self, freq): """ Calculate TimedeltaArray of difference between index values and index converted to PeriodArray at specified - freq. Used for vectorized offsets + freq. Used for vectorized offsets. Parameters ---------- diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 1f14bd169a228..069d661e6af34 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -21,7 +21,7 @@ is_scalar, ) from pandas.core.dtypes.dtypes import register_extension_dtype -from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries +from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna, notna from pandas.core import nanops, ops @@ -592,25 +592,29 @@ def _values_for_argsort(self) -> np.ndarray: @classmethod def _create_comparison_method(cls, op): - def cmp_method(self, other): + op_name = op.__name__ - op_name = op.__name__ - mask = None + def cmp_method(self, other): - if isinstance(other, (ABCSeries, ABCIndexClass)): + if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): # Rely on pandas to unbox and dispatch to us. return NotImplemented + other = lib.item_from_zerodim(other) + mask = None + if isinstance(other, IntegerArray): other, mask = other._data, other._mask elif is_list_like(other): other = np.asarray(other) - if other.ndim > 0 and len(self) != len(other): + if other.ndim > 1: + raise NotImplementedError( + "can only perform ops with 1-d structures" + ) + if len(self) != len(other): raise ValueError("Lengths must match to compare") - other = lib.item_from_zerodim(other) - # numpy will show a DeprecationWarning on invalid elementwise # comparisons, this will raise in the future with warnings.catch_warnings(): @@ -683,31 +687,31 @@ def _maybe_mask_result(self, result, mask, other, op_name): @classmethod def _create_arithmetic_method(cls, op): - def integer_arithmetic_method(self, other): + op_name = op.__name__ - op_name = op.__name__ - mask = None + def integer_arithmetic_method(self, other): - if isinstance(other, (ABCSeries, ABCIndexClass)): + if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): # Rely on pandas to unbox and dispatch to us. 
return NotImplemented - if getattr(other, "ndim", 0) > 1: - raise NotImplementedError("can only perform ops with 1-d structures") + other = lib.item_from_zerodim(other) + mask = None if isinstance(other, IntegerArray): other, mask = other._data, other._mask - elif getattr(other, "ndim", None) == 0: - other = other.item() - elif is_list_like(other): other = np.asarray(other) - if not other.ndim: - other = other.item() - elif other.ndim == 1: - if not (is_float_dtype(other) or is_integer_dtype(other)): - raise TypeError("can only perform ops with numeric values") + if other.ndim > 1: + raise NotImplementedError( + "can only perform ops with 1-d structures" + ) + if len(self) != len(other): + raise ValueError("Lengths must match") + if not (is_float_dtype(other) or is_integer_dtype(other)): + raise TypeError("can only perform ops with numeric values") + else: if not (is_float(other) or is_integer(other)): raise TypeError("can only perform ops with numeric values") diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 2b3c02bd1cade..7a14d6f1b619a 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -358,54 +358,10 @@ def from_arrays(cls, left, right, closed="right", copy=False, dtype=None): left, right, closed, copy=copy, dtype=dtype, verify_integrity=True ) - _interval_shared_docs[ - "from_intervals" - ] = """ - Construct an %(klass)s from a 1d array of Interval objects - - .. deprecated:: 0.23.0 - - Parameters - ---------- - data : array-like (1-dimensional) - Array of Interval objects. All intervals must be closed on the same - sides. - copy : boolean, default False - by-default copy the data, this is compat only and ignored - dtype : dtype or None, default None - If None, dtype will be inferred - - ..versionadded:: 0.23.0 - - See Also - -------- - interval_range : Function to create a fixed frequency IntervalIndex. - %(klass)s.from_arrays : Construct an %(klass)s from a left and - right array. - %(klass)s.from_breaks : Construct an %(klass)s from an array of - splits. - %(klass)s.from_tuples : Construct an %(klass)s from an - array-like of tuples. - - Examples - -------- - >>> pd.%(qualname)s.from_intervals([pd.Interval(0, 1), - ... pd.Interval(1, 2)]) - %(klass)s([(0, 1], (1, 2]], - closed='right', dtype='interval[int64]') - - The generic Index constructor work identically when it infers an array - of all intervals: - - >>> pd.Index([pd.Interval(0, 1), pd.Interval(1, 2)]) - %(klass)s([(0, 1], (1, 2]], - closed='right', dtype='interval[int64]') - """ - _interval_shared_docs[ "from_tuples" ] = """ - Construct an %(klass)s from an array-like of tuples + Construct an %(klass)s from an array-like of tuples. Parameters ---------- @@ -739,18 +695,14 @@ def isna(self): return isna(self.left) @property - def nbytes(self): + def nbytes(self) -> int: return self.left.nbytes + self.right.nbytes @property - def size(self): + def size(self) -> int: # Avoid materializing self.values return self.left.size - @property - def shape(self): - return self.left.shape - def take(self, indices, allow_fill=False, fill_value=None, axis=None, **kwargs): """ Take elements from the IntervalArray. @@ -902,7 +854,7 @@ def _format_space(self): def left(self): """ Return the left endpoints of each Interval in the IntervalArray as - an Index + an Index. """ return self._left @@ -910,7 +862,7 @@ def left(self): def right(self): """ Return the right endpoints of each Interval in the IntervalArray as - an Index + an Index. 
""" return self._right @@ -918,7 +870,7 @@ def right(self): def closed(self): """ Whether the intervals are closed on the left-side, right-side, both or - neither + neither. """ return self._closed @@ -926,7 +878,7 @@ def closed(self): "set_closed" ] = """ Return an %(klass)s identical to the current one, but closed on the - specified side + specified side. .. versionadded:: 0.24.0 @@ -965,7 +917,7 @@ def set_closed(self, closed): def length(self): """ Return an Index with entries denoting the length of each Interval in - the IntervalArray + the IntervalArray. """ try: return self.right - self.left @@ -993,7 +945,7 @@ def mid(self): ] = """ Return True if the %(klass)s is non-overlapping (no Intervals share points) and is either monotonic increasing or monotonic decreasing, - else False + else False. """ # https://github.com/python/mypy/issues/1362 # Mypy does not support decorated properties @@ -1043,7 +995,7 @@ def __array__(self, dtype=None): _interval_shared_docs[ "to_tuples" ] = """ - Return an %(return_type)s of tuples of the form (left, right) + Return an %(return_type)s of tuples of the form (left, right). Parameters ---------- diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 39529177b9e35..4e2e37d88eb9a 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -125,7 +125,11 @@ def __init__(self, values, copy=False): if isinstance(values, type(self)): values = values._ndarray if not isinstance(values, np.ndarray): - raise ValueError("'values' must be a NumPy array.") + raise ValueError( + "'values' must be a NumPy array, not {typ}".format( + typ=type(values).__name__ + ) + ) if values.ndim != 1: raise ValueError("PandasArray must be 1-dimensional.") @@ -241,11 +245,11 @@ def __setitem__(self, key, value): else: self._ndarray[key] = value - def __len__(self): + def __len__(self) -> int: return len(self._ndarray) @property - def nbytes(self): + def nbytes(self) -> int: return self._ndarray.nbytes def isna(self): diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 20ce11c70c344..f2d74794eadf5 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -426,7 +426,7 @@ def __array__(self, dtype=None): @property def is_leap_year(self): """ - Logical indicating if the date belongs to a leap year + Logical indicating if the date belongs to a leap year. """ return isleapyear_arr(np.asarray(self.year)) diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index 47c7c72051150..201174b6b1995 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -39,6 +39,7 @@ ) from pandas.core.dtypes.dtypes import register_extension_dtype from pandas.core.dtypes.generic import ( + ABCDataFrame, ABCIndexClass, ABCSeries, ABCSparseArray, @@ -839,7 +840,7 @@ def fill_value(self, value): self._dtype = SparseDtype(self.dtype.subtype, value) @property - def kind(self): + def kind(self) -> str: """ The kind of sparse index for this array. One of {'integer', 'block'}. 
""" @@ -854,7 +855,7 @@ def _valid_sp_values(self): mask = notna(sp_vals) return sp_vals[mask] - def __len__(self): + def __len__(self) -> int: return self.sp_index.length @property @@ -868,7 +869,7 @@ def _fill_value_matches(self, fill_value): return self.fill_value == fill_value @property - def nbytes(self): + def nbytes(self) -> int: return self.sp_values.nbytes + self.sp_index.nbytes @property @@ -886,7 +887,7 @@ def density(self): return r @property - def npoints(self): + def npoints(self) -> int: """ The number of non- ``fill_value`` points. @@ -1693,6 +1694,9 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): for sp_value, fv in zip(sp_values, fill_value) ) return arrays + elif is_scalar(sp_values): + # e.g. reductions + return sp_values return self._simple_new( sp_values, self.sp_index, SparseDtype(sp_values.dtype, fill_value) @@ -1732,13 +1736,15 @@ def sparse_unary_method(self): @classmethod def _create_arithmetic_method(cls, op): - def sparse_arithmetic_method(self, other): - op_name = op.__name__ + op_name = op.__name__ - if isinstance(other, (ABCSeries, ABCIndexClass)): + def sparse_arithmetic_method(self, other): + if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): # Rely on pandas to dispatch to us. return NotImplemented + other = lib.item_from_zerodim(other) + if isinstance(other, SparseArray): return _sparse_array_op(self, other, op, op_name) @@ -1781,11 +1787,11 @@ def sparse_arithmetic_method(self, other): @classmethod def _create_comparison_method(cls, op): - def cmp_method(self, other): - op_name = op.__name__ + op_name = op.__name__ + if op_name in {"and_", "or_"}: + op_name = op_name[:-1] - if op_name in {"and_", "or_"}: - op_name = op_name[:-1] + def cmp_method(self, other): if isinstance(other, (ABCSeries, ABCIndexClass)): # Rely on pandas to unbox and dispatch to us. diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index afd1e8203059e..3609c68a26c0f 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -41,9 +41,9 @@ ) from pandas.core.dtypes.missing import isna -from pandas.core import ops from pandas.core.algorithms import checked_add_with_arr import pandas.core.common as com +from pandas.core.ops.invalid import invalid_comparison from pandas.tseries.frequencies import to_offset from pandas.tseries.offsets import Tick @@ -90,14 +90,14 @@ def wrapper(self, other): other = Timedelta(other) except ValueError: # failed to parse as timedelta - return ops.invalid_comparison(self, other, op) + return invalid_comparison(self, other, op) result = op(self.view("i8"), other.value) if isna(other): result.fill(nat_result) elif not is_list_like(other): - return ops.invalid_comparison(self, other, op) + return invalid_comparison(self, other, op) elif len(other) != len(self): raise ValueError("Lengths must match") @@ -106,7 +106,7 @@ def wrapper(self, other): try: other = type(self)._from_sequence(other)._data except (ValueError, TypeError): - return ops.invalid_comparison(self, other, op) + return invalid_comparison(self, other, op) result = op(self.view("i8"), other.view("i8")) result = com.values_from_object(result) @@ -173,8 +173,8 @@ class TimedeltaArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps): "ceil", ] - # Needed so that NaT.__richcmp__(DateTimeArray) operates pointwise - ndim = 1 + # Note: ndim must be defined to ensure NaT.__richcmp(TimedeltaArray) + # operates pointwise. 
@property def _box_func(self): @@ -776,12 +776,14 @@ def __rdivmod__(self, other): res2 = other - res1 * self return res1, res2 - # Note: TimedeltaIndex overrides this in call to cls._add_numeric_methods def __neg__(self): if self.freq is not None: return type(self)(-self._data, freq=-self.freq) return type(self)(-self._data) + def __pos__(self): + return type(self)(self._data, freq=self.freq) + def __abs__(self): # Note: freq is not preserved return type(self)(np.abs(self._data)) diff --git a/pandas/core/base.py b/pandas/core/base.py index 7d2a62318232c..2d5ffb5e91392 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -47,7 +47,6 @@ class PandasObject(DirNamesMixin): - """baseclass for various pandas objects""" @property @@ -1462,7 +1461,7 @@ def is_monotonic_decreasing(self): def memory_usage(self, deep=False): """ - Memory usage of the values + Memory usage of the values. Parameters ---------- diff --git a/pandas/core/computation/align.py b/pandas/core/computation/align.py index 1046401850963..3e1e5ed89d877 100644 --- a/pandas/core/computation/align.py +++ b/pandas/core/computation/align.py @@ -9,6 +9,7 @@ from pandas.errors import PerformanceWarning import pandas as pd +from pandas.core.base import PandasObject import pandas.core.common as com from pandas.core.computation.common import _result_type_many @@ -34,7 +35,7 @@ def _zip_axes_from_type(typ, new_axes): def _any_pandas_objects(terms): """Check a sequence of terms for instances of PandasObject.""" - return any(isinstance(term.value, pd.core.generic.PandasObject) for term in terms) + return any(isinstance(term.value, PandasObject) for term in terms) def _filter_special_cases(f): @@ -132,7 +133,8 @@ def _align(terms): def _reconstruct_object(typ, obj, axes, dtype): - """Reconstruct an object given its type, raw value, and possibly empty + """ + Reconstruct an object given its type, raw value, and possibly empty (None) axes. Parameters @@ -157,7 +159,7 @@ def _reconstruct_object(typ, obj, axes, dtype): res_t = np.result_type(obj.dtype, dtype) - if not isinstance(typ, partial) and issubclass(typ, pd.core.generic.PandasObject): + if not isinstance(typ, partial) and issubclass(typ, PandasObject): return typ(obj, dtype=res_t, **axes) # special case for pathological things like ~True/~False diff --git a/pandas/core/computation/common.py b/pandas/core/computation/common.py index b8e212fd2a32e..bd32c8bee1cdf 100644 --- a/pandas/core/computation/common.py +++ b/pandas/core/computation/common.py @@ -2,7 +2,7 @@ import numpy as np -import pandas as pd +from pandas._config import get_option # A token value Python's tokenizer probably will never use. _BACKTICK_QUOTED_STRING = 100 @@ -11,7 +11,7 @@ def _ensure_decoded(s): """ if we have bytes, decode them to unicode """ if isinstance(s, (np.bytes_, bytes)): - s = s.decode(pd.get_option("display.encoding")) + s = s.decode(get_option("display.encoding")) return s @@ -36,8 +36,3 @@ def _remove_spaces_column_name(name): class NameResolutionError(NameError): pass - - -class StringMixin: - # TODO: delete this class. Removing this ATM caused a failure. - pass diff --git a/pandas/core/computation/engines.py b/pandas/core/computation/engines.py index 2c94b142a45b3..3cc34ea1f4ed7 100644 --- a/pandas/core/computation/engines.py +++ b/pandas/core/computation/engines.py @@ -17,7 +17,8 @@ class NumExprClobberingError(NameError): def _check_ne_builtin_clash(expr): - """Attempt to prevent foot-shooting in a helpful way. + """ + Attempt to prevent foot-shooting in a helpful way. 
Parameters ---------- @@ -53,7 +54,8 @@ def convert(self): return printing.pprint_thing(self.expr) def evaluate(self): - """Run the engine on the expression + """ + Run the engine on the expression. This method performs alignment which is necessary no matter what engine is being used, thus its implementation is in the base class. @@ -78,7 +80,8 @@ def _is_aligned(self): @abc.abstractmethod def _evaluate(self): - """Return an evaluated expression. + """ + Return an evaluated expression. Parameters ---------- @@ -94,7 +97,6 @@ def _evaluate(self): class NumExprEngine(AbstractEngine): - """NumExpr engine class""" has_neg_frac = True @@ -127,8 +129,8 @@ def _evaluate(self): class PythonEngine(AbstractEngine): - - """Evaluate an expression in Python space. + """ + Evaluate an expression in Python space. Mostly for testing purposes. """ diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index d0d87c23e9346..45319a4d63d94 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -41,7 +41,8 @@ def tokenize_string(source): - """Tokenize a Python source code string. + """ + Tokenize a Python source code string. Parameters ---------- @@ -366,8 +367,8 @@ def f(cls): @disallow(_unsupported_nodes) @add_ops(_op_classes) class BaseExprVisitor(ast.NodeVisitor): - - """Custom ast walker. Parsers of other engines should subclass this class + """ + Custom ast walker. Parsers of other engines should subclass this class if necessary. Parameters @@ -581,6 +582,9 @@ def visit_NameConstant(self, node, **kwargs): def visit_Num(self, node, **kwargs): return self.const_type(node.n, self.env) + def visit_Constant(self, node, **kwargs): + return self.const_type(node.n, self.env) + def visit_Str(self, node, **kwargs): name = self.env.add_tmp(node.s) return self.term_type(name, self.env) @@ -799,8 +803,8 @@ def __init__(self, env, engine, parser, preparser=lambda x: x): class Expr: - - """Object encapsulating an expression. + """ + Object encapsulating an expression. 
Parameters ---------- diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py index d9dc194d484ae..29c8239fa518f 100644 --- a/pandas/core/computation/expressions.py +++ b/pandas/core/computation/expressions.py @@ -76,16 +76,17 @@ def _can_use_numexpr(op, op_str, a, b, dtype_check): # required min elements (otherwise we are adding overhead) if np.prod(a.shape) > _MIN_ELEMENTS: - # check for dtype compatibility dtypes = set() for o in [a, b]: - if hasattr(o, "dtypes"): + # Series implements dtypes, check for dimension count as well + if hasattr(o, "dtypes") and o.ndim > 1: s = o.dtypes.value_counts() if len(s) > 1: return False dtypes |= set(s.index.astype(str)) - elif isinstance(o, np.ndarray): + # ndarray and Series Case + elif hasattr(o, "dtype"): dtypes |= {o.dtype.name} # allowed are a superset @@ -99,15 +100,13 @@ def _evaluate_numexpr(op, op_str, a, b, truediv=True, reversed=False, **eval_kwa result = None if _can_use_numexpr(op, op_str, a, b, "evaluate"): - try: - - # we were originally called by a reversed op - # method - if reversed: - a, b = b, a + if reversed: + # we were originally called by a reversed op method + a, b = b, a - a_value = getattr(a, "values", a) - b_value = getattr(b, "values", b) + a_value = getattr(a, "values", a) + b_value = getattr(b, "values", b) + try: result = ne.evaluate( "a_value {op} b_value".format(op=op_str), local_dict={"a_value": a_value, "b_value": b_value}, @@ -138,11 +137,11 @@ def _where_numexpr(cond, a, b): result = None if _can_use_numexpr(None, "where", a, b, "where"): + cond_value = getattr(cond, "values", cond) + a_value = getattr(a, "values", a) + b_value = getattr(b, "values", b) try: - cond_value = getattr(cond, "values", cond) - a_value = getattr(a, "values", a) - b_value = getattr(b, "values", b) result = ne.evaluate( "where(cond_value, a_value, b_value)", local_dict={ @@ -203,17 +202,19 @@ def _bool_arith_check( def evaluate(op, op_str, a, b, use_numexpr=True, **eval_kwargs): - """ evaluate and return the expression of the op on a and b - - Parameters - ---------- - - op : the actual operand - op_str: the string version of the op - a : left operand - b : right operand - use_numexpr : whether to try to use numexpr (default True) - """ + """ + Evaluate and return the expression of the op on a and b. + + Parameters + ---------- + op : the actual operand + op_str : str + The string version of the op. + a : left operand + b : right operand + use_numexpr : bool, default True + Whether to try to use numexpr. + """ use_numexpr = use_numexpr and _bool_arith_check(op_str, a, b) if use_numexpr: @@ -222,16 +223,17 @@ def evaluate(op, op_str, a, b, use_numexpr=True, **eval_kwargs): def where(cond, a, b, use_numexpr=True): - """ evaluate the where condition cond on a and b - - Parameters - ---------- - - cond : a boolean array - a : return if cond is True - b : return if cond is False - use_numexpr : whether to try to use numexpr (default True) - """ + """ + Evaluate the where condition cond on a and b. + + Parameters + ---------- + cond : np.ndarray[bool] + a : return if cond is True + b : return if cond is False + use_numexpr : bool, default True + Whether to try to use numexpr. 
+ """ if use_numexpr: return _where(cond, a, b) diff --git a/pandas/core/computation/ops.py b/pandas/core/computation/ops.py index 2bf09a553ce18..28b6aef693bfe 100644 --- a/pandas/core/computation/ops.py +++ b/pandas/core/computation/ops.py @@ -51,8 +51,9 @@ class UndefinedVariableError(NameError): - - """NameError subclass for local variables.""" + """ + NameError subclass for local variables. + """ def __init__(self, name, is_local): if is_local: @@ -191,8 +192,8 @@ def __repr__(self): class Op: - - """Hold an operator of arbitrary arity + """ + Hold an operator of arbitrary arity. """ def __init__(self, op, operands, *args, **kwargs): @@ -204,8 +205,9 @@ def __iter__(self): return iter(self.operands) def __repr__(self): - """Print a generic n-ary operator and its operands using infix - notation""" + """ + Print a generic n-ary operator and its operands using infix notation. + """ # recurse over the operands parened = ("({0})".format(pprint_thing(opr)) for opr in self.operands) return pprint_thing(" {0} ".format(self.op).join(parened)) @@ -296,7 +298,8 @@ def _not_in(x, y): def _cast_inplace(terms, acceptable_dtypes, dtype): - """Cast an expression inplace. + """ + Cast an expression inplace. Parameters ---------- @@ -304,7 +307,6 @@ def _cast_inplace(terms, acceptable_dtypes, dtype): The expression that should cast. acceptable_dtypes : list of acceptable numpy.dtype Will not cast if term's dtype in this list. - dtype : str or numpy.dtype The dtype to cast to. """ @@ -325,8 +327,8 @@ def is_term(obj): class BinOp(Op): - - """Hold a binary operator and its operands + """ + Hold a binary operator and its operands. Parameters ---------- @@ -355,7 +357,8 @@ def __init__(self, op, lhs, rhs, **kwargs): ) def __call__(self, env): - """Recursively evaluate an expression in Python space. + """ + Recursively evaluate an expression in Python space. Parameters ---------- @@ -377,7 +380,8 @@ def __call__(self, env): return self.func(left, right) def evaluate(self, env, engine, parser, term_type, eval_in_python): - """Evaluate a binary operation *before* being passed to the engine. + """ + Evaluate a binary operation *before* being passed to the engine. Parameters ---------- @@ -472,8 +476,8 @@ def isnumeric(dtype): class Div(BinOp): - - """Div operator to special case casting. + """ + Div operator to special case casting. Parameters ---------- @@ -504,8 +508,8 @@ def __init__(self, lhs, rhs, truediv, *args, **kwargs): class UnaryOp(Op): - - """Hold a unary operator and its operands + """ + Hold a unary operator and its operands. 
Parameters ---------- diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index 1523eb05ac41d..81658ab23ba46 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -478,7 +478,6 @@ def _validate_where(w): class Expr(expr.Expr): - """ hold a pytables like expression, comprised of possibly multiple 'terms' Parameters @@ -573,7 +572,6 @@ def evaluate(self): class TermValue: - """ hold a term value the we use to construct a condition/filter """ def __init__(self, value, converted, kind): diff --git a/pandas/core/computation/scope.py b/pandas/core/computation/scope.py index 8ddd0dd7622e7..b11411eb2dc66 100644 --- a/pandas/core/computation/scope.py +++ b/pandas/core/computation/scope.py @@ -15,9 +15,6 @@ from pandas._libs.tslibs import Timestamp from pandas.compat.chainmap import DeepChainMap -import pandas.core.computation as compu -from pandas.core.computation.common import StringMixin - def _ensure_scope( level, global_dict=None, local_dict=None, resolvers=(), target=None, **kwargs @@ -67,7 +64,8 @@ def _raw_hex_id(obj): def _get_pretty_string(obj): - """Return a prettier version of obj + """ + Return a prettier version of obj. Parameters ---------- @@ -84,9 +82,9 @@ def _get_pretty_string(obj): return sio.getvalue() -class Scope(StringMixin): - - """Object to hold scope, with a few bells to deal with some custom syntax +class Scope: + """ + Object to hold scope, with a few bells to deal with some custom syntax and contexts added by pandas. Parameters @@ -105,7 +103,7 @@ class Scope(StringMixin): temps : dict """ - __slots__ = "level", "scope", "target", "temps" + __slots__ = ["level", "scope", "target", "resolvers", "temps"] def __init__( self, level, global_dict=None, local_dict=None, resolvers=(), target=None @@ -163,7 +161,8 @@ def has_resolvers(self): return bool(len(self.resolvers)) def resolve(self, key, is_local): - """Resolve a variable name in a possibly local context + """ + Resolve a variable name in a possibly local context. Parameters ---------- @@ -198,10 +197,14 @@ def resolve(self, key, is_local): # e.g., df[df > 0] return self.temps[key] except KeyError: - raise compu.ops.UndefinedVariableError(key, is_local) + # runtime import because ops imports from scope + from pandas.core.computation.ops import UndefinedVariableError + + raise UndefinedVariableError(key, is_local) def swapkey(self, old_key, new_key, new_value=None): - """Replace a variable name, with a potentially new value. + """ + Replace a variable name, with a potentially new value. Parameters ---------- @@ -225,7 +228,8 @@ def swapkey(self, old_key, new_key, new_value=None): return def _get_vars(self, stack, scopes): - """Get specifically scoped variables from a list of stack frames. + """ + Get specifically scoped variables from a list of stack frames. Parameters ---------- @@ -247,7 +251,8 @@ def _get_vars(self, stack, scopes): del frame def update(self, level): - """Update the current scope by going back `level` levels. + """ + Update the current scope by going back `level` levels. Parameters ---------- @@ -266,7 +271,8 @@ def update(self, level): del stack[:], stack def add_tmp(self, value): - """Add a temporary variable to the scope. + """ + Add a temporary variable to the scope. Parameters ---------- @@ -297,7 +303,8 @@ def ntemps(self): @property def full_scope(self): - """Return the full scope for use with passing to engines transparently + """ + Return the full scope for use with passing to engines transparently as a mapping. 
Returns diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index be6086dd360f2..08dce6aca6e6d 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -17,6 +17,7 @@ is_callable, is_instance_factory, is_int, + is_nonnegative_int, is_one_of_factory, is_text, ) @@ -319,7 +320,7 @@ def is_terminal(): with cf.config_prefix("display"): - cf.register_option("precision", 6, pc_precision_doc, validator=is_int) + cf.register_option("precision", 6, pc_precision_doc, validator=is_nonnegative_int) cf.register_option( "float_format", None, @@ -333,12 +334,7 @@ def is_terminal(): pc_max_info_rows_doc, validator=is_instance_factory((int, type(None))), ) - cf.register_option( - "max_rows", - 60, - pc_max_rows_doc, - validator=is_instance_factory([type(None), int]), - ) + cf.register_option("max_rows", 60, pc_max_rows_doc, validator=is_nonnegative_int) cf.register_option( "min_rows", 10, @@ -352,10 +348,7 @@ def is_terminal(): else: max_cols = 20 # cannot determine optimal number of columns cf.register_option( - "max_columns", - max_cols, - pc_max_cols_doc, - validator=is_instance_factory([type(None), int]), + "max_columns", max_cols, pc_max_cols_doc, validator=is_nonnegative_int ) cf.register_option( "large_repr", diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 6f599a6be6021..056cd2222af3c 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -133,6 +133,8 @@ def _isna_new(obj): # hack (for now) because MI registers as ndarray elif isinstance(obj, ABCMultiIndex): raise NotImplementedError("isna is not defined for MultiIndex") + elif isinstance(obj, type): + return False elif isinstance( obj, ( @@ -171,6 +173,8 @@ def _isna_old(obj): # hack (for now) because MI registers as ndarray elif isinstance(obj, ABCMultiIndex): raise NotImplementedError("isna is not defined for MultiIndex") + elif isinstance(obj, type): + return False elif isinstance(obj, (ABCSeries, np.ndarray, ABCIndexClass)): return _isna_ndarraylike_old(obj) elif isinstance(obj, ABCGeneric): diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 02241eeaae7b2..16fece1c7eb8b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -86,12 +86,7 @@ from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray from pandas.core.arrays.sparse import SparseFrameAccessor from pandas.core.generic import NDFrame, _shared_docs -from pandas.core.index import ( - Index, - MultiIndex, - ensure_index, - ensure_index_from_sequences, -) +from pandas.core.index import Index, ensure_index, ensure_index_from_sequences from pandas.core.indexes import base as ibase from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.multi import maybe_droplevels @@ -108,6 +103,7 @@ sanitize_index, to_arrays, ) +from pandas.core.ops.missing import dispatch_fill_zeros from pandas.core.series import Series from pandas.io.formats import console, format as fmt @@ -669,15 +665,33 @@ def _repr_html_(self): if get_option("display.notebook_repr_html"): max_rows = get_option("display.max_rows") + min_rows = get_option("display.min_rows") max_cols = get_option("display.max_columns") show_dimensions = get_option("display.show_dimensions") - return self.to_html( + formatter = fmt.DataFrameFormatter( + self, + columns=None, + col_space=None, + na_rep="NaN", + formatters=None, + float_format=None, + sparsify=None, + justify=None, + index_names=True, + header=True, + index=True, + bold_rows=True, + escape=True, max_rows=max_rows, + 
min_rows=min_rows, max_cols=max_cols, show_dimensions=show_dimensions, - notebook=True, + decimal=".", + table_id=None, + render_links=False, ) + return formatter.to_html(notebook=True) else: return None @@ -770,12 +784,13 @@ def style(self): _shared_docs[ "items" ] = r""" - Iterator over (column name, Series) pairs. + Iterate over (column name, Series) pairs. Iterates over the DataFrame columns, returning a tuple with the column name and the content as a Series. - %s + Yields + ------ label : object The column names for the DataFrame being iterated over. content : Series @@ -816,7 +831,7 @@ def style(self): Name: population, dtype: int64 """ - @Appender(_shared_docs["items"] % "Yields\n ------") + @Appender(_shared_docs["items"]) def items(self): if self.columns.is_unique and hasattr(self, "_item_cache"): for k in self.columns: @@ -825,9 +840,9 @@ def items(self): for i, k in enumerate(self.columns): yield k, self._ixs(i, axis=1) - @Appender(_shared_docs["items"] % "Returns\n -------") + @Appender(_shared_docs["items"]) def iteritems(self): - return self.items() + yield from self.items() def iterrows(self): """ @@ -845,8 +860,8 @@ def iterrows(self): See Also -------- - itertuples : Iterate over DataFrame rows as namedtuples of the values. - items : Iterate over (column name, Series) pairs. + DataFrame.itertuples : Iterate over DataFrame rows as namedtuples of the values. + DataFrame.items : Iterate over (column name, Series) pairs. Notes ----- @@ -1189,7 +1204,7 @@ def to_numpy(self, dtype=None, copy=False): Parameters ---------- dtype : str or numpy.dtype, optional - The dtype to pass to :meth:`numpy.asarray` + The dtype to pass to :meth:`numpy.asarray`. copy : bool, default False Whether to ensure that the returned value is a not a view on another array. Note that ``copy=False`` does not *ensure* that @@ -1729,7 +1744,7 @@ def to_records( if is_datetime64_any_dtype(self.index) and convert_datetime64: ix_vals = [self.index.to_pydatetime()] else: - if isinstance(self.index, MultiIndex): + if isinstance(self.index, ABCMultiIndex): # array of tuples to numpy cols. copy copy copy ix_vals = list(map(np.array, zip(*self.index.values))) else: @@ -1740,7 +1755,7 @@ def to_records( count = 0 index_names = list(self.index.names) - if isinstance(self.index, MultiIndex): + if isinstance(self.index, ABCMultiIndex): for i, n in enumerate(index_names): if n is None: index_names[i] = "level_%d" % count @@ -2863,7 +2878,7 @@ def __getitem__(self, key): # The behavior is inconsistent. It returns a Series, except when # - the key itself is repeated (test on data.shape, #9519), or # - we have a MultiIndex on columns (test on self.columns, #21309) - if data.shape[1] == 1 and not isinstance(self.columns, MultiIndex): + if data.shape[1] == 1 and not isinstance(self.columns, ABCMultiIndex): data = data[key] return data @@ -3093,7 +3108,7 @@ def _ensure_valid_index(self, value): passed value. 
""" # GH5632, make sure that we are a Series convertible - if not len(self.index) and is_list_like(value): + if not len(self.index) and is_list_like(value) and len(value): try: value = Series(value) except (ValueError, NotImplementedError, TypeError): @@ -3447,15 +3462,14 @@ def _get_info_slice(obj, indexer): if not is_list_like(exclude): exclude = (exclude,) if exclude is not None else () - selection = tuple(map(frozenset, (include, exclude))) + selection = (frozenset(include), frozenset(exclude)) if not any(selection): raise ValueError("at least one of include or exclude must be nonempty") # convert the myriad valid dtypes object to a single representation - include, exclude = map( - lambda x: frozenset(map(infer_dtype_from_object, x)), selection - ) + include = frozenset(infer_dtype_from_object(x) for x in include) + exclude = frozenset(infer_dtype_from_object(x) for x in exclude) for dtypes in (include, exclude): invalidate_string_dtypes(dtypes) @@ -3653,7 +3667,7 @@ def reindexer(value): elif isinstance(value, DataFrame): # align right-hand-side columns if self.columns # is multi-index and self[key] is a sub-frame - if isinstance(self.columns, MultiIndex) and key in self.columns: + if isinstance(self.columns, ABCMultiIndex) and key in self.columns: loc = self.columns.get_loc(key) if isinstance(loc, (slice, Series, np.ndarray, Index)): cols = maybe_droplevels(self.columns[loc], key) @@ -3702,7 +3716,7 @@ def reindexer(value): # broadcast across multiple columns if necessary if broadcast and key in self.columns and value.ndim == 1: - if not self.columns.is_unique or isinstance(self.columns, MultiIndex): + if not self.columns.is_unique or isinstance(self.columns, ABCMultiIndex): existing_piece = self[key] if isinstance(existing_piece, DataFrame): value = np.tile(value, (len(existing_piece.columns), 1)) @@ -4597,7 +4611,7 @@ def _maybe_casted_values(index, labels=None): new_index = self.index.droplevel(level) if not drop: - if isinstance(self.index, MultiIndex): + if isinstance(self.index, ABCMultiIndex): names = [ n if n is not None else ("level_%d" % i) for (i, n) in enumerate(self.index.names) @@ -4608,7 +4622,7 @@ def _maybe_casted_values(index, labels=None): names = [default] if self.index.name is None else [self.index.name] to_insert = ((self.index, None),) - multi_col = isinstance(self.columns, MultiIndex) + multi_col = isinstance(self.columns, ABCMultiIndex) for i, (lev, lab) in reversed(list(enumerate(to_insert))): if not (level is None or i in level): continue @@ -4990,7 +5004,7 @@ def sort_index( level, ascending=ascending, sort_remaining=sort_remaining ) - elif isinstance(labels, MultiIndex): + elif isinstance(labels, ABCMultiIndex): from pandas.core.sorting import lexsort_indexer indexer = lexsort_indexer( @@ -5276,7 +5290,7 @@ def reorder_levels(self, order, axis=0): type of caller (new object) """ axis = self._get_axis_number(axis) - if not isinstance(self._get_axis(axis), MultiIndex): # pragma: no cover + if not isinstance(self._get_axis(axis), ABCMultiIndex): # pragma: no cover raise TypeError("Can only reorder levels on a hierarchical axis.") result = self.copy() @@ -5294,25 +5308,34 @@ def _combine_frame(self, other, func, fill_value=None, level=None): this, other = self.align(other, join="outer", level=level, copy=False) new_index, new_columns = this.index, this.columns - def _arith_op(left, right): - # for the mixed_type case where we iterate over columns, - # _arith_op(left, right) is equivalent to - # left._binop(right, func, fill_value=fill_value) - left, right = 
ops.fill_binop(left, right, fill_value) - return func(left, right) + if fill_value is None: + # since _arith_op may be called in a loop, avoid function call + # overhead if possible by doing this check once + _arith_op = func + + else: + + def _arith_op(left, right): + # for the mixed_type case where we iterate over columns, + # _arith_op(left, right) is equivalent to + # left._binop(right, func, fill_value=fill_value) + left, right = ops.fill_binop(left, right, fill_value) + return func(left, right) if ops.should_series_dispatch(this, other, func): # iterate over columns return ops.dispatch_to_series(this, other, _arith_op) else: - result = _arith_op(this.values, other.values) + with np.errstate(all="ignore"): + result = _arith_op(this.values, other.values) + result = dispatch_fill_zeros(func, this.values, other.values, result) return self._constructor( result, index=new_index, columns=new_columns, copy=False ) def _combine_match_index(self, other, func, level=None): left, right = self.align(other, join="outer", axis=0, level=level, copy=False) - assert left.index.equals(right.index) + # at this point we have `left.index.equals(right.index)` if left._is_mixed_type or right._is_mixed_type: # operate column-wise; avoid costly object-casting in `.values` @@ -5325,14 +5348,13 @@ def _combine_match_index(self, other, func, level=None): new_data, index=left.index, columns=self.columns, copy=False ) - def _combine_match_columns(self, other, func, level=None): - assert isinstance(other, Series) + def _combine_match_columns(self, other: Series, func, level=None): left, right = self.align(other, join="outer", axis=1, level=level, copy=False) - assert left.columns.equals(right.index) + # at this point we have `left.columns.equals(right.index)` return ops.dispatch_to_series(left, right, func, axis="columns") def _combine_const(self, other, func): - assert lib.is_scalar(other) or np.ndim(other) == 0 + # scalar other or np.ndim(other) == 0 return ops.dispatch_to_series(self, other, func) def combine(self, other, func, fill_value=None, overwrite=True): @@ -6177,14 +6199,14 @@ def stack(self, level=-1, dropna=True): def explode(self, column: Union[str, Tuple]) -> "DataFrame": """ - Transform each element of a list-like to a row, replicating the - index values. + Transform each element of a list-like to a row, replicating index values. .. versionadded:: 0.25.0 Parameters ---------- column : str or tuple + Column to explode. Returns ------- @@ -6200,8 +6222,8 @@ def explode(self, column: Union[str, Tuple]) -> "DataFrame": See Also -------- DataFrame.unstack : Pivot a level of the (necessarily hierarchical) - index labels - DataFrame.melt : Unpivot a DataFrame from wide format to long format + index labels. + DataFrame.melt : Unpivot a DataFrame from wide format to long format. Series.explode : Explode a DataFrame from list-like columns to long format. 
Notes @@ -7772,7 +7794,7 @@ def _count_level(self, level, axis=0, numeric_only=False): count_axis = frame._get_axis(axis) agg_axis = frame._get_agg_axis(axis) - if not isinstance(count_axis, MultiIndex): + if not isinstance(count_axis, ABCMultiIndex): raise TypeError( "Can only count levels on hierarchical " "{ax}.".format(ax=self._get_axis_name(axis)) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 1b39f9225a0ed..1a5b36b07e93c 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7,7 +7,17 @@ import pickle import re from textwrap import dedent -from typing import Callable, Dict, FrozenSet, List, Optional, Set +from typing import ( + Callable, + Dict, + FrozenSet, + Hashable, + List, + Optional, + Sequence, + Set, + Union, +) import warnings import weakref @@ -50,7 +60,7 @@ from pandas.core.dtypes.missing import isna, notna import pandas as pd -from pandas._typing import Dtype +from pandas._typing import Dtype, FilePathOrBuffer from pandas.core import missing, nanops import pandas.core.algorithms as algos from pandas.core.base import PandasObject, SelectionMixin @@ -122,6 +132,9 @@ def _single_replace(self, to_replace, method, inplace, limit): return result +bool_t = bool # Need alias because NDFrame has def bool: + + class NDFrame(PandasObject, SelectionMixin): """ N-dimensional analogue of DataFrame. Store multi-dimensional in a @@ -131,7 +144,7 @@ class NDFrame(PandasObject, SelectionMixin): ---------- data : BlockManager axes : list - copy : boolean, default False + copy : bool, default False """ _internal_names = [ @@ -280,7 +293,8 @@ def _setup_axes( ns=None, docs=None, ): - """Provide axes setup for the major PandasObjects. + """ + Provide axes setup for the major PandasObjects. Parameters ---------- @@ -288,8 +302,8 @@ def _setup_axes( info_axis_num : the axis of the selector dimension (int) stat_axis_num : the number of axis for the default stats (int) aliases : other names for a single axis (dict) - axes_are_reversed : boolean whether to treat passed axes as - reversed (DataFrame) + axes_are_reversed : bool + Whether to treat passed axes as reversed (DataFrame). build_axes : setup the axis properties (default True) """ @@ -676,7 +690,7 @@ def transpose(self, *args, **kwargs): Parameters ---------- args : %(args_transpose)s - copy : boolean, default False + copy : bool, default False Make a copy of the underlying data. Mixed-dtype data will always result in a copy **kwargs @@ -1874,7 +1888,7 @@ def __iter__(self): # can we get a better explanation of this? def keys(self): """ - Get the 'info axis' (see Indexing for more) + Get the 'info axis' (see Indexing for more). This is index for Series, columns for DataFrame. @@ -2179,6 +2193,12 @@ def _repr_data_resource_(self): ... df1.to_excel(writer, sheet_name='Sheet_name_1') ... df2.to_excel(writer, sheet_name='Sheet_name_2') + ExcelWriter can also be used to append to an existing Excel file: + + >>> with pd.ExcelWriter('output.xlsx', + ... mode='a') as writer: # doctest: +SKIP + ... df.to_excel(writer, sheet_name='Sheet_name_3') + To set the library that is used to write the Excel file, you can pass the `engine` keyword (the default engine is automatically chosen depending on the file extension): @@ -2251,10 +2271,10 @@ def to_json( Parameters ---------- - path_or_buf : string or file handle, optional + path_or_buf : str or file handle, optional File path or object. If not specified, the result is returned as a string. - orient : string + orient : str Indication of expected JSON string format. 
* Series @@ -2533,7 +2553,7 @@ def to_msgpack(self, path_or_buf=None, encoding="utf-8", **kwargs): def to_sql( self, - name, + name: str, con, schema=None, if_exists="fail", @@ -2551,12 +2571,12 @@ def to_sql( Parameters ---------- - name : string + name : str Name of SQL table. con : sqlalchemy.engine.Engine or sqlite3.Connection Using SQLAlchemy makes it possible to use any DB supported by that library. Legacy support is provided for sqlite3.Connection objects. - schema : string, optional + schema : str, optional Specify the schema (if database flavor supports this). If None, use default schema. if_exists : {'fail', 'replace', 'append'}, default 'fail' @@ -2569,18 +2589,19 @@ def to_sql( index : bool, default True Write DataFrame index as a column. Uses `index_label` as the column name in the table. - index_label : string or sequence, default None + index_label : str or sequence, default None Column label for index column(s). If None is given (default) and `index` is True, then the index names are used. A sequence should be given if the DataFrame uses MultiIndex. chunksize : int, optional - Rows will be written in batches of this size at a time. By default, - all rows will be written at once. - dtype : dict, optional - Specifying the datatype for columns. The keys should be the column - names and the values should be the SQLAlchemy types or strings for - the sqlite3 legacy mode. - method : {None, 'multi', callable}, default None + Specify the number of rows in each batch to be written at a time. + By default, all rows will be written at once. + dtype : dict or scalar, optional + Specifying the datatype for columns. If a dictionary is used, the + keys should be the column names and the values should be the + SQLAlchemy types or strings for the sqlite3 legacy mode. If a + scalar is provided, it will be applied to all columns. + method : {None, 'multi', callable}, optional Controls the SQL insertion clause used: * None : Uses standard SQL ``INSERT`` clause (one per row). @@ -2993,10 +3014,15 @@ def to_latex( >>> df = pd.DataFrame({'name': ['Raphael', 'Donatello'], ... 'mask': ['red', 'purple'], ... 
'weapon': ['sai', 'bo staff']}) - >>> df.to_latex(index=False) # doctest: +NORMALIZE_WHITESPACE - '\\begin{tabular}{lll}\n\\toprule\n name & mask & weapon - \\\\\n\\midrule\n Raphael & red & sai \\\\\n Donatello & - purple & bo staff \\\\\n\\bottomrule\n\\end{tabular}\n' + >>> print(df.to_latex(index=False)) # doctest: +NORMALIZE_WHITESPACE + \begin{tabular}{lll} + \toprule + name & mask & weapon \\ + \midrule + Raphael & red & sai \\ + Donatello & purple & bo staff \\ + \bottomrule + \end{tabular} """ # Get defaults from the pandas config if self.ndim == 1: @@ -3039,26 +3065,26 @@ def to_latex( def to_csv( self, - path_or_buf=None, - sep=",", - na_rep="", - float_format=None, - columns=None, - header=True, - index=True, - index_label=None, - mode="w", - encoding=None, - compression="infer", - quoting=None, - quotechar='"', - line_terminator=None, - chunksize=None, - date_format=None, - doublequote=True, - escapechar=None, - decimal=".", - ): + path_or_buf: Optional[FilePathOrBuffer] = None, + sep: str = ",", + na_rep: str = "", + float_format: Optional[str] = None, + columns: Optional[Sequence[Hashable]] = None, + header: Union[bool_t, List[str]] = True, + index: bool_t = True, + index_label: Optional[Union[bool_t, str, Sequence[Hashable]]] = None, + mode: str = "w", + encoding: Optional[str] = None, + compression: Optional[Union[str, Dict[str, str]]] = "infer", + quoting: Optional[int] = None, + quotechar: str = '"', + line_terminator: Optional[str] = None, + chunksize: Optional[int] = None, + date_format: Optional[str] = None, + doublequote: bool_t = True, + escapechar: Optional[str] = None, + decimal: Optional[str] = ".", + ) -> Optional[str]: r""" Write object to a comma-separated values (csv) file. @@ -3105,16 +3131,21 @@ def to_csv( encoding : str, optional A string representing the encoding to use in the output file, defaults to 'utf-8'. - compression : str, default 'infer' - Compression mode among the following possible values: {'infer', - 'gzip', 'bz2', 'zip', 'xz', None}. If 'infer' and `path_or_buf` - is path-like, then detect compression from the following - extensions: '.gz', '.bz2', '.zip' or '.xz'. (otherwise no - compression). - - .. versionchanged:: 0.24.0 - - 'infer' option added and set to default. + compression : str or dict, default 'infer' + If str, represents compression mode. If dict, value at 'method' is + the compression mode. Compression mode may be any of the following + possible values: {'infer', 'gzip', 'bz2', 'zip', 'xz', None}. If + compression mode is 'infer' and `path_or_buf` is path-like, then + detect compression mode from the following extensions: '.gz', + '.bz2', '.zip' or '.xz'. (otherwise no compression). If dict given + and mode is 'zip' or inferred as 'zip', other entries passed as + additional compression options. + + .. versionchanged:: 0.25.0 + + May now be a dict with key 'method' as compression mode + and other entries as additional compression options if + compression mode is 'zip'. quoting : optional constant from csv module Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format` @@ -3159,6 +3190,13 @@ def to_csv( ... 'weapon': ['sai', 'bo staff']}) >>> df.to_csv(index=False) 'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n' + + # create 'out.zip' containing 'out.csv' + >>> compression_opts = dict(method='zip', + ... archive_name='out.csv') # doctest: +SKIP + + >>> df.to_csv('out.zip', index=False, + ... 
compression=compression_opts) # doctest: +SKIP """ df = self if isinstance(self, ABCDataFrame) else self.to_frame() @@ -3192,6 +3230,8 @@ def to_csv( if path_or_buf is None: return formatter.path_or_buf.getvalue() + return None + # ---------------------------------------------------------------------- # Fancy Indexing @@ -3240,11 +3280,10 @@ def _maybe_update_cacher(self, clear=False, verify_is_copy=True): Parameters ---------- - clear : boolean, default False - clear the item cache - verify_is_copy : boolean, default True - provide is_copy checks - + clear : bool, default False + Clear the item cache. + verify_is_copy : bool, default True + Provide is_copy checks. """ cacher = getattr(self, "_cacher", None) @@ -3610,11 +3649,11 @@ def _check_setitem_copy(self, stacklevel=4, t="setting", force=False): Parameters ---------- - stacklevel : integer, default 4 + stacklevel : int, default 4 the level to show of the stack when the error is output - t : string, the type of setting error - force : boolean, default False - if True, then force showing an error + t : str, the type of setting error + force : bool, default False + If True, then force showing an error. validate if we are doing a setitem on a chained copy. @@ -3943,9 +3982,8 @@ def _update_inplace(self, result, verify_is_copy=True): Parameters ---------- - verify_is_copy : boolean, default True - provide is_copy checks - + verify_is_copy : bool, default True + Provide is_copy checks. """ # NOTE: This does *not* call __finalize__ and that's an explicit # decision that we may revisit in the future. @@ -4560,9 +4598,9 @@ def filter(self, items=None, like=None, regex=None, axis=None): ---------- items : list-like Keep labels from axis which are in items. - like : string + like : str Keep labels from axis for which "like in label == True". - regex : string (regular expression) + regex : str (regular expression) Keep labels from axis for which re.search(regex, label) == True. axis : int or string axis name The axis to filter on. By default this is the info axis, @@ -4781,7 +4819,7 @@ def sample( frac : float, optional Fraction of axis items to return. Cannot be used with `n`. replace : bool, default False - Sample with or without replacement. + Allow or disallow sampling of the same row more than once. weights : str or ndarray-like, optional Default 'None' results in equal probability weighting. If passed a Series, will align with target object on index. Index @@ -5222,8 +5260,8 @@ def _consolidate(self, inplace=False): Parameters ---------- - inplace : boolean, default False - If False return new object, otherwise modify existing object + inplace : bool, default False + If False return new object, otherwise modify existing object. Returns ------- @@ -5669,11 +5707,12 @@ def as_blocks(self, copy=True): Parameters ---------- - copy : boolean, default True + copy : bool, default True Returns ------- - values : a dict of dtype -> Constructor Types + dict + Mapping dtype -> Constructor Types. """ warnings.warn( "as_blocks is deprecated and will be removed in a future version", @@ -5982,17 +6021,17 @@ def _convert( Parameters ---------- - datetime : boolean, default False + datetime : bool, default False If True, convert to date where possible. - numeric : boolean, default False + numeric : bool, default False If True, attempt to convert to numbers (including strings), with unconvertible values becoming NaN. - timedelta : boolean, default False + timedelta : bool, default False If True, convert to timedelta where possible. 
- coerce : boolean, default False + coerce : bool, default False If True, force conversion with unconvertible values converted to - nulls (NaN or NaT) - copy : boolean, default True + nulls (NaN or NaT). + copy : bool, default True If True, return a copy even if no copy is necessary (e.g. no conversion was done). Note: This is meant for internal use, and should not be confused with inplace. @@ -6631,11 +6670,7 @@ def replace( for k, v in items: keys, values = list(zip(*v.items())) or ([], []) - if set(keys) & set(values): - raise ValueError( - "Replacement not allowed with " - "overlapping keys and values" - ) + to_rep_dict[k] = list(keys) value_dict[k] = list(values) @@ -7859,7 +7894,7 @@ def asfreq(self, freq, method=None, how=None, normalize=False, fill_value=None): Parameters ---------- - freq : DateOffset object, or string + freq : DateOffset or str method : {'backfill'/'bfill', 'pad'/'ffill'}, default None Method to use for filling holes in reindexed Series (note this does not fill NaNs that already were present): @@ -8660,7 +8695,7 @@ def ranker(data): level : int or level name, default None Broadcast across a level, matching Index values on the passed MultiIndex level - copy : boolean, default True + copy : bool, default True Always returns new objects. If copy=False and no reindexing is required then original objects are returned. fill_value : scalar, default np.NaN @@ -9452,7 +9487,7 @@ def truncate(self, before=None, after=None, axis=None, copy=True): Truncate all rows after this index value. axis : {0 or 'index', 1 or 'columns'}, optional Axis to truncate. Truncates the index (rows) by default. - copy : boolean, default is True, + copy : bool, default is True, Return a copy of the truncated section. Returns @@ -9596,13 +9631,13 @@ def tz_convert(self, tz, axis=0, level=None, copy=True): Parameters ---------- - tz : string or pytz.timezone object + tz : str or tzinfo object axis : the axis to convert level : int, str, default None - If axis ia a MultiIndex, convert a specific level. Otherwise - must be None - copy : boolean, default True - Also make a copy of the underlying data + If axis is a MultiIndex, convert a specific level. Otherwise + must be None. + copy : bool, default True + Also make a copy of the underlying data. Returns ------- @@ -9656,12 +9691,12 @@ def tz_localize( Parameters ---------- - tz : string or pytz.timezone object + tz : str or tzinfo axis : the axis to localize level : int, str, default None If axis ia a MultiIndex, localize a specific level. Otherwise must be None - copy : boolean, default True + copy : bool, default True Also make a copy of the underlying data ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' When clocks moved backward due to DST, ambiguous times may arise. @@ -10683,9 +10718,9 @@ def _add_series_or_dataframe_operations(cls): the doc strings again. 
""" - from pandas.core import window as rwindow + from pandas.core.window import EWM, Expanding, Rolling, Window - @Appender(rwindow.rolling.__doc__) + @Appender(Rolling.__doc__) def rolling( self, window, @@ -10697,7 +10732,20 @@ def rolling( closed=None, ): axis = self._get_axis_number(axis) - return rwindow.rolling( + + if win_type is not None: + return Window( + self, + window=window, + min_periods=min_periods, + center=center, + win_type=win_type, + on=on, + axis=axis, + closed=closed, + ) + + return Rolling( self, window=window, min_periods=min_periods, @@ -10710,16 +10758,14 @@ def rolling( cls.rolling = rolling - @Appender(rwindow.expanding.__doc__) + @Appender(Expanding.__doc__) def expanding(self, min_periods=1, center=False, axis=0): axis = self._get_axis_number(axis) - return rwindow.expanding( - self, min_periods=min_periods, center=center, axis=axis - ) + return Expanding(self, min_periods=min_periods, center=center, axis=axis) cls.expanding = expanding - @Appender(rwindow.ewm.__doc__) + @Appender(EWM.__doc__) def ewm( self, com=None, @@ -10732,7 +10778,7 @@ def ewm( axis=0, ): axis = self._get_axis_number(axis) - return rwindow.ewm( + return EWM( self, com=com, span=span, @@ -10987,7 +11033,7 @@ def _doc_parms(cls): ---------- axis : {0 or 'index', 1 or 'columns'}, default 0 The index or the name of the axis. 0 is equivalent to None or 'index'. -skipna : boolean, default True +skipna : bool, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA. *args, **kwargs : diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 2ad85903b916b..5e463d50d43d6 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -21,7 +21,11 @@ from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender, Substitution -from pandas.core.dtypes.cast import maybe_convert_objects, maybe_downcast_to_dtype +from pandas.core.dtypes.cast import ( + maybe_convert_objects, + maybe_downcast_numeric, + maybe_downcast_to_dtype, +) from pandas.core.dtypes.common import ( ensure_int64, ensure_platform_int, @@ -180,10 +184,8 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True, min_count=-1): continue finally: if result is not no_result: - dtype = block.values.dtype - # see if we can cast the block back to the original dtype - result = block._try_coerce_and_cast_result(result, dtype=dtype) + result = maybe_downcast_numeric(result, block.dtype) newb = block.make_block(result) new_items.append(locs) @@ -240,15 +242,18 @@ def aggregate(self, func, *args, **kwargs): # grouper specific aggregations if self.grouper.nkeys > 1: return self._python_agg_general(func, *args, **kwargs) + elif args or kwargs: + result = self._aggregate_generic(func, *args, **kwargs) else: # try to treat as if we are passing a list try: - assert not args and not kwargs result = self._aggregate_multiple_funcs( [func], _level=_level, _axis=self.axis ) - + except Exception: + result = self._aggregate_generic(func) + else: result.columns = Index( result.columns.levels[0], name=self._selected_obj.columns.name ) @@ -258,15 +263,15 @@ def aggregate(self, func, *args, **kwargs): # values. concat no longer converts DataFrame[Sparse] # to SparseDataFrame, so we do it here. 
result = SparseDataFrame(result._data) - except Exception: - result = self._aggregate_generic(func, *args, **kwargs) if not self.as_index: self._insert_inaxis_grouper_inplace(result) result.index = np.arange(len(result)) if relabeling: - result = result[order] + + # used reordered index of columns + result = result.iloc[:, order] result.columns = columns return result._convert(datetime=True) @@ -309,10 +314,10 @@ def _aggregate_item_by_item(self, func, *args, **kwargs): cannot_agg = [] errors = None for item in obj: - try: - data = obj[item] - colg = SeriesGroupBy(data, selection=item, grouper=self.grouper) + data = obj[item] + colg = SeriesGroupBy(data, selection=item, grouper=self.grouper) + try: cast = self._transform_should_cast(func) result[item] = colg.aggregate(func, *args, **kwargs) @@ -680,7 +685,7 @@ def _transform_item_by_item(self, obj, wrapper): return DataFrame(output, index=obj.index, columns=columns) - def filter(self, func, dropna=True, *args, **kwargs): # noqa + def filter(self, func, dropna=True, *args, **kwargs): """ Return a copy of a DataFrame excluding elements from groups that do not satisfy the boolean criterion specified by func. @@ -831,45 +836,45 @@ def apply(self, func, *args, **kwargs): axis="", ) @Appender(_shared_docs["aggregate"]) - def aggregate(self, func_or_funcs=None, *args, **kwargs): + def aggregate(self, func=None, *args, **kwargs): _level = kwargs.pop("_level", None) - relabeling = func_or_funcs is None + relabeling = func is None columns = None - no_arg_message = "Must provide 'func_or_funcs' or named aggregation **kwargs." + no_arg_message = "Must provide 'func' or named aggregation **kwargs." if relabeling: columns = list(kwargs) if not PY36: # sort for 3.5 and earlier columns = list(sorted(columns)) - func_or_funcs = [kwargs[col] for col in columns] + func = [kwargs[col] for col in columns] kwargs = {} if not columns: raise TypeError(no_arg_message) - if isinstance(func_or_funcs, str): - return getattr(self, func_or_funcs)(*args, **kwargs) + if isinstance(func, str): + return getattr(self, func)(*args, **kwargs) - if isinstance(func_or_funcs, abc.Iterable): + if isinstance(func, abc.Iterable): # Catch instances of lists / tuples # but not the class list / tuple itself. 
- func_or_funcs = _maybe_mangle_lambdas(func_or_funcs) - ret = self._aggregate_multiple_funcs(func_or_funcs, (_level or 0) + 1) + func = _maybe_mangle_lambdas(func) + ret = self._aggregate_multiple_funcs(func, (_level or 0) + 1) if relabeling: ret.columns = columns else: - cyfunc = self._get_cython_func(func_or_funcs) + cyfunc = self._get_cython_func(func) if cyfunc and not args and not kwargs: return getattr(self, cyfunc)() if self.grouper.nkeys > 1: - return self._python_agg_general(func_or_funcs, *args, **kwargs) + return self._python_agg_general(func, *args, **kwargs) try: - return self._python_agg_general(func_or_funcs, *args, **kwargs) + return self._python_agg_general(func, *args, **kwargs) except Exception: - result = self._aggregate_named(func_or_funcs, *args, **kwargs) + result = self._aggregate_named(func, *args, **kwargs) index = Index(sorted(result), name=self.grouper.names[0]) ret = Series(result, index=index) @@ -1462,8 +1467,8 @@ class DataFrameGroupBy(NDFrameGroupBy): axis="", ) @Appender(_shared_docs["aggregate"]) - def aggregate(self, arg=None, *args, **kwargs): - return super().aggregate(arg, *args, **kwargs) + def aggregate(self, func=None, *args, **kwargs): + return super().aggregate(func, *args, **kwargs) agg = aggregate @@ -1729,8 +1734,8 @@ def _normalize_keyword_aggregation(kwargs): The transformed kwargs. columns : List[str] The user-provided keys. - order : List[Tuple[str, str]] - Pairs of the input and output column names. + col_idx_order : List[int] + List of columns indices. Examples -------- @@ -1757,7 +1762,39 @@ def _normalize_keyword_aggregation(kwargs): else: aggspec[column] = [aggfunc] order.append((column, com.get_callable_name(aggfunc) or aggfunc)) - return aggspec, columns, order + + # uniquify aggfunc name if duplicated in order list + uniquified_order = _make_unique(order) + + # GH 25719, due to aggspec will change the order of assigned columns in aggregation + # uniquified_aggspec will store uniquified order list and will compare it with order + # based on index + aggspec_order = [ + (column, com.get_callable_name(aggfunc) or aggfunc) + for column, aggfuncs in aggspec.items() + for aggfunc in aggfuncs + ] + uniquified_aggspec = _make_unique(aggspec_order) + + # get the new indice of columns by comparison + col_idx_order = Index(uniquified_aggspec).get_indexer(uniquified_order) + return aggspec, columns, col_idx_order + + +def _make_unique(seq): + """Uniquify aggfunc name of the pairs in the order list + + Examples: + -------- + >>> _make_unique([('a', ''), ('a', ''), ('b', '')]) + [('a', '_0'), ('a', '_1'), ('b', '')] + """ + return [ + (pair[0], "_".join([pair[1], str(seq[:i].count(pair))])) + if seq.count(pair) > 1 + else pair + for i, pair in enumerate(seq) + ] # TODO: Can't use, because mypy doesn't like us setting __name__ diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index c5e81e21e9fd5..6deef16bdec13 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -653,7 +653,8 @@ def curried(x): # mark this column as an error try: return self._aggregate_item_by_item(name, *args, **kwargs) - except (AttributeError): + except AttributeError: + # e.g. SparseArray has no flags attr raise ValueError return wrapper @@ -1011,7 +1012,6 @@ def _apply_filter(self, indices, dropna): class GroupBy(_GroupBy): - """ Class for grouping and aggregating relational data. 
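Editor's note: the `_make_unique` helper and the `get_indexer` comparison added to `pandas/core/groupby/generic.py` above can be hard to follow from the hunk alone. Below is a minimal, self-contained sketch of that reordering logic; `make_unique` mirrors the `_make_unique` helper from the diff, while the `("h", "min")`-style pairs are hypothetical inputs chosen only to illustrate the duplicated (column, aggfunc) case from GH 25719, not values taken from the pandas test suite.

import pandas as pd

def make_unique(seq):
    # suffix duplicated (column, aggfunc-name) pairs with a running counter,
    # as the _make_unique helper introduced above does
    return [
        (pair[0], "_".join([pair[1], str(seq[:i].count(pair))]))
        if seq.count(pair) > 1
        else pair
        for i, pair in enumerate(seq)
    ]

# order as the user spelled the named aggregations in **kwargs ...
requested = [("h", "min"), ("w", "min"), ("h", "min")]
# ... versus the order produced by iterating the aggspec dict
aggspec_order = [("h", "min"), ("h", "min"), ("w", "min")]

# uniquify both sides so get_indexer can match duplicated pairs positionally
col_idx_order = pd.Index(make_unique(aggspec_order)).get_indexer(
    make_unique(requested)
)
print(col_idx_order)  # [0 2 1]; used as result.iloc[:, col_idx_order]
                      # to restore the user-requested column order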
@@ -1773,7 +1773,11 @@ def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFra if not self.as_index: return out - out.index = self.grouper.result_index[ids[mask]] + result_index = self.grouper.result_index + out.index = result_index[ids[mask]] + + if not self.observed and isinstance(result_index, CategoricalIndex): + out = out.reindex(result_index) return out.sort_index() if self.sort else out @@ -1870,6 +1874,7 @@ def quantile(self, q=0.5, interpolation="linear"): a 2.0 b 3.0 """ + from pandas import concat def pre_processor(vals: np.ndarray) -> Tuple[np.ndarray, Optional[Type]]: if is_object_dtype(vals): @@ -1897,18 +1902,57 @@ def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray: return vals - return self._get_cythonized_result( - "group_quantile", - self.grouper, - aggregate=True, - needs_values=True, - needs_mask=True, - cython_dtype=np.float64, - pre_processing=pre_processor, - post_processing=post_processor, - q=q, - interpolation=interpolation, - ) + if is_scalar(q): + return self._get_cythonized_result( + "group_quantile", + self.grouper, + aggregate=True, + needs_values=True, + needs_mask=True, + cython_dtype=np.float64, + pre_processing=pre_processor, + post_processing=post_processor, + q=q, + interpolation=interpolation, + ) + else: + results = [ + self._get_cythonized_result( + "group_quantile", + self.grouper, + aggregate=True, + needs_values=True, + needs_mask=True, + cython_dtype=np.float64, + pre_processing=pre_processor, + post_processing=post_processor, + q=qi, + interpolation=interpolation, + ) + for qi in q + ] + result = concat(results, axis=0, keys=q) + # fix levels to place quantiles on the inside + # TODO(GH-10710): Ideally, we could write this as + # >>> result.stack(0).loc[pd.IndexSlice[:, ..., q], :] + # but this hits https://github.com/pandas-dev/pandas/issues/10710 + # which doesn't reorder the list-like `q` on the inner level. + order = np.roll(list(range(result.index.nlevels)), -1) + result = result.reorder_levels(order) + result = result.reindex(q, level=-1) + + # fix order. + hi = len(q) * self.ngroups + arr = np.arange(0, hi, self.ngroups) + arrays = [] + + for i in range(self.ngroups): + arr2 = arr + i + arrays.append(arr2) + + indices = np.concatenate(arrays) + assert len(indices) == len(result) + return result.take(indices) @Substitution(name="groupby") def ngroup(self, ascending=True): @@ -2326,8 +2370,9 @@ def head(self, n=5): """ Return first n rows of each group. - Essentially equivalent to ``.apply(lambda x: x.head(n))``, - except ignores as_index flag. + Similar to ``.apply(lambda x: x.head(n))``, but it returns a subset of rows + from the original DataFrame with original index and order preserved + (``as_index`` flag is ignored). Returns ------- @@ -2338,10 +2383,6 @@ def head(self, n=5): >>> df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], ... columns=['A', 'B']) - >>> df.groupby('A', as_index=False).head(1) - A B - 0 1 2 - 2 5 6 >>> df.groupby('A').head(1) A B 0 1 2 @@ -2357,8 +2398,9 @@ def tail(self, n=5): """ Return last n rows of each group. - Essentially equivalent to ``.apply(lambda x: x.tail(n))``, - except ignores as_index flag. + Similar to ``.apply(lambda x: x.tail(n))``, but it returns a subset of rows + from the original DataFrame with original index and order preserved + (``as_index`` flag is ignored). 
Returns ------- @@ -2373,10 +2415,6 @@ def tail(self, n=5): A B 1 a 2 3 b 2 - >>> df.groupby('A').head(1) - A B - 0 a 1 - 2 b 1 """ self._reset_group_selection() mask = self._cumcount_array(ascending=False) < n diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 143755a47b97b..31623171e9e63 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -37,7 +37,7 @@ class Grouper: """ A Grouper allows the user to specify a groupby instruction for a target - object + object. This specification will select a column via the key parameter, or if the level and/or axis parameters are given, a level of the index of the target @@ -217,7 +217,6 @@ def __repr__(self): class Grouping: - """ Holds the grouping information for a single key diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 676f243c9c8d3..7afb0a28f943e 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -12,7 +12,7 @@ from pandas._libs import NaT, iNaT, lib import pandas._libs.groupby as libgroupby -import pandas._libs.reduction as reduction +import pandas._libs.reduction as libreduction from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly @@ -207,7 +207,7 @@ def apply(self, f, data, axis=0): if len(result_values) == len(group_keys): return group_keys, result_values, mutated - except reduction.InvalidApply: + except libreduction.InvalidApply: # Cannot fast apply on MultiIndex (_has_complex_internals). # This Exception is also raised if `f` triggers an exception # but it is preferable to raise the exception in Python. @@ -591,6 +591,8 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs): if is_datetime64tz_dtype(orig_values.dtype): result = type(orig_values)(result.astype(np.int64), dtype=orig_values.dtype) + elif is_datetimelike and kind == "aggregate": + result = result.astype(orig_values.dtype) return result, names @@ -676,7 +678,7 @@ def _aggregate_series_fast(self, obj, func): indexer = get_group_index_sorter(group_index, ngroups) obj = obj.take(indexer) group_index = algorithms.take_nd(group_index, indexer, allow_fill=False) - grouper = reduction.SeriesGrouper(obj, func, group_index, ngroups, dummy) + grouper = libreduction.SeriesGrouper(obj, func, group_index, ngroups, dummy) result, counts = grouper.get_result() return result, counts @@ -704,7 +706,6 @@ def _aggregate_series_pure_python(self, obj, func): class BinGrouper(BaseGrouper): - """ This is an internal Grouper class @@ -850,7 +851,7 @@ def groupings(self): def agg_series(self, obj, func): dummy = obj[:0] - grouper = reduction.SeriesBinGrouper(obj, func, self.bins, dummy) + grouper = libreduction.SeriesBinGrouper(obj, func, self.bins, dummy) return grouper.get_result() @@ -938,7 +939,7 @@ def fast_apply(self, f, names): return [], True sdata = self._get_sorted_data() - return reduction.apply_frame_axis0(sdata, f, names, starts, ends) + return libreduction.apply_frame_axis0(sdata, f, names, starts, ends) def _chop(self, sdata, slice_obj): if self.axis == 0: diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py index 70c48e969172f..433bca940c028 100644 --- a/pandas/core/indexers.py +++ b/pandas/core/indexers.py @@ -226,6 +226,7 @@ def length_of_indexer(indexer, target=None) -> int: if step is None: step = 1 elif step < 0: + start, stop = stop + 1, start + 1 step = -step return (stop - start + step - 1) // step elif isinstance(indexer, (ABCSeries, ABCIndexClass, np.ndarray, list)): diff --git 
a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 2271ff643bc15..2dbd592fc6787 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1,4 +1,4 @@ -from datetime import datetime, timedelta +from datetime import datetime import operator from textwrap import dedent from typing import Union @@ -10,6 +10,7 @@ import pandas._libs.join as libjoin from pandas._libs.lib import is_datetime_array from pandas._libs.tslibs import OutOfBoundsDatetime, Timestamp +from pandas._libs.tslibs.period import IncompatibleFrequency from pandas._libs.tslibs.timezones import tz_compare from pandas.compat import set_function_name from pandas.compat.numpy import function as nv @@ -48,8 +49,8 @@ ) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.generic import ( + ABCCategorical, ABCDataFrame, - ABCDateOffset, ABCDatetimeArray, ABCDatetimeIndex, ABCIndexClass, @@ -70,7 +71,8 @@ from pandas.core.indexers import maybe_convert_indices from pandas.core.indexes.frozen import FrozenList import pandas.core.missing as missing -from pandas.core.ops import get_op_result_name, make_invalid_op +from pandas.core.ops import get_op_result_name +from pandas.core.ops.invalid import make_invalid_op import pandas.core.sorting as sorting from pandas.core.strings import StringMethods @@ -98,30 +100,27 @@ def _make_comparison_op(op, cls): def cmp_method(self, other): - if isinstance(other, (np.ndarray, Index, ABCSeries)): + if isinstance(other, (np.ndarray, Index, ABCSeries, ExtensionArray)): if other.ndim > 0 and len(self) != len(other): raise ValueError("Lengths must match to compare") - if is_object_dtype(self) and not isinstance(self, ABCMultiIndex): + if is_object_dtype(self) and isinstance(other, ABCCategorical): + left = type(other)(self._values, dtype=other.dtype) + return op(left, other) + elif is_object_dtype(self) and not isinstance(self, ABCMultiIndex): # don't pass MultiIndex with np.errstate(all="ignore"): - result = ops._comp_method_OBJECT_ARRAY(op, self.values, other) + result = ops.comp_method_OBJECT_ARRAY(op, self.values, other) else: with np.errstate(all="ignore"): result = op(self.values, np.asarray(other)) - # technically we could support bool dtyped Index - # for now just return the indexing array directly if is_bool_dtype(result): return result - try: - return Index(result) - except TypeError: - return result + return ops.invalid_comparison(self, other, op) name = "__{name}__".format(name=op.__name__) - # TODO: docstring? return set_function_name(cmp_method, name, cls) @@ -264,7 +263,13 @@ def __new__( fastpath=None, tupleize_cols=True, **kwargs - ): + ) -> "Index": + + from .range import RangeIndex + from pandas import PeriodIndex, DatetimeIndex, TimedeltaIndex + from .numeric import Float64Index, Int64Index, UInt64Index + from .interval import IntervalIndex + from .category import CategoricalIndex if name is None and hasattr(data, "name"): name = data.name @@ -279,8 +284,6 @@ def __new__( if fastpath: return cls._simple_new(data, name) - from .range import RangeIndex - if isinstance(data, ABCPandasArray): # ensure users don't accidentally put a PandasArray in an index. 
data = data.to_numpy() @@ -293,72 +296,53 @@ def __new__( # categorical elif is_categorical_dtype(data) or is_categorical_dtype(dtype): - from .category import CategoricalIndex - return CategoricalIndex(data, dtype=dtype, copy=copy, name=name, **kwargs) # interval elif ( is_interval_dtype(data) or is_interval_dtype(dtype) ) and not is_object_dtype(dtype): - from .interval import IntervalIndex - closed = kwargs.get("closed", None) return IntervalIndex(data, dtype=dtype, name=name, copy=copy, closed=closed) elif ( is_datetime64_any_dtype(data) - or (dtype is not None and is_datetime64_any_dtype(dtype)) + or is_datetime64_any_dtype(dtype) or "tz" in kwargs ): - from pandas import DatetimeIndex - - if dtype is not None and is_dtype_equal(_o_dtype, dtype): + if is_dtype_equal(_o_dtype, dtype): # GH#23524 passing `dtype=object` to DatetimeIndex is invalid, # will raise in the where `data` is already tz-aware. So # we leave it out of this step and cast to object-dtype after # the DatetimeIndex construction. # Note we can pass copy=False because the .astype below # will always make a copy - result = DatetimeIndex(data, copy=False, name=name, **kwargs) + result = DatetimeIndex( + data, copy=False, name=name, **kwargs + ) # type: "Index" return result.astype(object) else: - result = DatetimeIndex( - data, copy=copy, name=name, dtype=dtype, **kwargs - ) - return result + return DatetimeIndex(data, copy=copy, name=name, dtype=dtype, **kwargs) - elif is_timedelta64_dtype(data) or ( - dtype is not None and is_timedelta64_dtype(dtype) - ): - from pandas import TimedeltaIndex - - if dtype is not None and is_dtype_equal(_o_dtype, dtype): + elif is_timedelta64_dtype(data) or is_timedelta64_dtype(dtype): + if is_dtype_equal(_o_dtype, dtype): # Note we can pass copy=False because the .astype below # will always make a copy result = TimedeltaIndex(data, copy=False, name=name, **kwargs) return result.astype(object) else: - result = TimedeltaIndex( - data, copy=copy, name=name, dtype=dtype, **kwargs - ) - return result + return TimedeltaIndex(data, copy=copy, name=name, dtype=dtype, **kwargs) elif is_period_dtype(data) and not is_object_dtype(dtype): - from pandas import PeriodIndex - - result = PeriodIndex(data, copy=copy, name=name, **kwargs) - return result + return PeriodIndex(data, copy=copy, name=name, **kwargs) # extension dtype elif is_extension_array_dtype(data) or is_extension_array_dtype(dtype): data = np.asarray(data) if not (dtype is None or is_object_dtype(dtype)): - # coerce to the provided dtype - data = dtype.construct_array_type()._from_sequence( - data, dtype=dtype, copy=False - ) + ea_cls = dtype.construct_array_type() + data = ea_cls._from_sequence(data, dtype=dtype, copy=False) # coerce to the object dtype data = data.astype(object) @@ -367,73 +351,53 @@ def __new__( # index-like elif isinstance(data, (np.ndarray, Index, ABCSeries)): if dtype is not None: - try: - - # we need to avoid having numpy coerce - # things that look like ints/floats to ints unless - # they are actually ints, e.g. 
'0' and 0.0 - # should not be coerced - # GH 11836 - if is_integer_dtype(dtype): - inferred = lib.infer_dtype(data, skipna=False) - if inferred == "integer": - data = maybe_cast_to_integer_array(data, dtype, copy=copy) - elif inferred in ["floating", "mixed-integer-float"]: - if isna(data).any(): - raise ValueError("cannot convert float NaN to integer") - - if inferred == "mixed-integer-float": - data = maybe_cast_to_integer_array(data, dtype) - - # If we are actually all equal to integers, - # then coerce to integer. - try: - return cls._try_convert_to_int_index( - data, copy, name, dtype - ) - except ValueError: - pass - - # Return an actual float index. - from .numeric import Float64Index - - return Float64Index(data, copy=copy, dtype=dtype, name=name) - - elif inferred == "string": - pass - else: - data = data.astype(dtype) - elif is_float_dtype(dtype): - inferred = lib.infer_dtype(data, skipna=False) - if inferred == "string": + # we need to avoid having numpy coerce + # things that look like ints/floats to ints unless + # they are actually ints, e.g. '0' and 0.0 + # should not be coerced + # GH 11836 + if is_integer_dtype(dtype): + inferred = lib.infer_dtype(data, skipna=False) + if inferred == "integer": + data = maybe_cast_to_integer_array(data, dtype, copy=copy) + elif inferred in ["floating", "mixed-integer-float"]: + if isna(data).any(): + raise ValueError("cannot convert float NaN to integer") + + if inferred == "mixed-integer-float": + data = maybe_cast_to_integer_array(data, dtype) + + # If we are actually all equal to integers, + # then coerce to integer. + try: + return cls._try_convert_to_int_index( + data, copy, name, dtype + ) + except ValueError: pass - else: - data = data.astype(dtype) + + # Return an actual float index. + return Float64Index(data, copy=copy, dtype=dtype, name=name) + + elif inferred == "string": + pass else: - data = np.array(data, dtype=dtype, copy=copy) - - except (TypeError, ValueError) as e: - msg = str(e) - if ( - "cannot convert float" in msg - or "Trying to coerce float values to integer" in msg - ): - raise + data = data.astype(dtype) + elif is_float_dtype(dtype): + inferred = lib.infer_dtype(data, skipna=False) + if inferred == "string": + pass + else: + data = data.astype(dtype) + else: + data = np.array(data, dtype=dtype, copy=copy) # maybe coerce to a sub-class - from pandas.core.indexes.period import PeriodIndex, IncompatibleFrequency - if is_signed_integer_dtype(data.dtype): - from .numeric import Int64Index - return Int64Index(data, copy=copy, dtype=dtype, name=name) elif is_unsigned_integer_dtype(data.dtype): - from .numeric import UInt64Index - return UInt64Index(data, copy=copy, dtype=dtype, name=name) elif is_float_dtype(data.dtype): - from .numeric import Float64Index - return Float64Index(data, copy=copy, dtype=dtype, name=name) elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data): subarr = data.astype("object") @@ -456,12 +420,8 @@ def __new__( return Index(subarr, copy=copy, dtype=object, name=name) elif inferred in ["floating", "mixed-integer-float", "integer-na"]: # TODO: Returns IntegerArray for integer-na case in the future - from .numeric import Float64Index - return Float64Index(subarr, copy=copy, name=name) elif inferred == "interval": - from .interval import IntervalIndex - try: return IntervalIndex(subarr, name=name, copy=copy) except ValueError: @@ -472,8 +432,6 @@ def __new__( pass elif inferred != "string": if inferred.startswith("datetime"): - from pandas import DatetimeIndex - try: return 
DatetimeIndex(subarr, copy=copy, name=name, **kwargs) except (ValueError, OutOfBoundsDatetime): @@ -483,8 +441,6 @@ def __new__( pass elif inferred.startswith("timedelta"): - from pandas import TimedeltaIndex - return TimedeltaIndex(subarr, copy=copy, name=name, **kwargs) elif inferred == "period": try: @@ -554,16 +510,6 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs): Must be careful not to recurse. """ - if not hasattr(values, "dtype"): - if (values is None or not len(values)) and dtype is not None: - values = np.empty(0, dtype=dtype) - else: - values = np.array(values, copy=False) - if is_object_dtype(values): - values = cls( - values, name=name, dtype=dtype, **kwargs - )._ndarray_values - if isinstance(values, (ABCSeries, ABCIndexClass)): # Index._data must always be an ndarray. # This is no-copy for when _values is an ndarray, @@ -690,7 +636,11 @@ def _cleanup(self): @cache_readonly def _engine(self): # property, for now, slow to look up - return self._engine_type(lambda: self._ndarray_values, len(self)) + + # to avoid a reference cycle, bind `_ndarray_values` to a local variable, so + # `self` is not passed into the lambda. + _ndarray_values = self._ndarray_values + return self._engine_type(lambda: _ndarray_values, len(self)) # -------------------------------------------------------------------- # Array-Like Methods @@ -717,7 +667,6 @@ def __array_wrap__(self, result, context=None): return result attrs = self._get_attributes_dict() - attrs = self._maybe_update_attributes(attrs) return Index(result, **attrs) @cache_readonly @@ -1857,8 +1806,6 @@ def inferred_type(self): @cache_readonly def is_all_dates(self): - if self._data is None: - return False return is_datetime_array(ensure_object(self.values)) # -------------------------------------------------------------------- @@ -2045,7 +1992,7 @@ def notna(self): _index_shared_docs[ "fillna" ] = """ - Fill NA/NaN values with the specified value + Fill NA/NaN values with the specified value. Parameters ---------- @@ -2076,7 +2023,7 @@ def fillna(self, value=None, downcast=None): _index_shared_docs[ "dropna" ] = """ - Return Index without NA/NaN values + Return Index without NA/NaN values. Parameters ---------- @@ -2350,7 +2297,10 @@ def __sub__(self, other): return Index(np.array(self) - other) def __rsub__(self, other): - return Index(other - np.array(self)) + # wrap Series to ensure we pin name correctly + from pandas import Series + + return Index(other - Series(self)) def __and__(self, other): return self.intersection(other) @@ -3129,13 +3079,9 @@ def _convert_scalar_indexer(self, key, kind=None): """ @Appender(_index_shared_docs["_convert_slice_indexer"]) - def _convert_slice_indexer(self, key, kind=None): + def _convert_slice_indexer(self, key: slice, kind=None): assert kind in ["ix", "loc", "getitem", "iloc", None] - # if we are not a slice, then we are done - if not isinstance(key, slice): - return key - # validate iloc if kind == "iloc": return slice( @@ -4713,7 +4659,7 @@ def get_value(self, series, key): raise try: - return libindex.get_value_box(s, key) + return libindex.get_value_at(s, key) except IndexError: raise except TypeError: @@ -5363,67 +5309,6 @@ def _add_numeric_methods_disabled(cls): cls.__abs__ = make_invalid_op("__abs__") cls.__inv__ = make_invalid_op("__inv__") - def _maybe_update_attributes(self, attrs): - """ - Update Index attributes (e.g. freq) depending on op. 
- """ - return attrs - - def _validate_for_numeric_unaryop(self, op, opstr): - """ - Validate if we can perform a numeric unary operation. - """ - if not self._is_numeric_dtype: - raise TypeError( - "cannot evaluate a numeric op " - "{opstr} for type: {typ}".format(opstr=opstr, typ=type(self).__name__) - ) - - def _validate_for_numeric_binop(self, other, op): - """ - Return valid other; evaluate or raise TypeError if we are not of - the appropriate type. - - Notes - ----- - This is an internal method called by ops. - """ - opstr = "__{opname}__".format(opname=op.__name__) - # if we are an inheritor of numeric, - # but not actually numeric (e.g. DatetimeIndex/PeriodIndex) - if not self._is_numeric_dtype: - raise TypeError( - "cannot evaluate a numeric op {opstr} " - "for type: {typ}".format(opstr=opstr, typ=type(self).__name__) - ) - - if isinstance(other, Index): - if not other._is_numeric_dtype: - raise TypeError( - "cannot evaluate a numeric op " - "{opstr} with type: {typ}".format(opstr=opstr, typ=type(other)) - ) - elif isinstance(other, np.ndarray) and not other.ndim: - other = other.item() - - if isinstance(other, (Index, ABCSeries, np.ndarray)): - if len(self) != len(other): - raise ValueError("cannot evaluate a numeric op with unequal lengths") - other = com.values_from_object(other) - if other.dtype.kind not in ["f", "i", "u"]: - raise TypeError("cannot evaluate a numeric op with a non-numeric dtype") - elif isinstance(other, (ABCDateOffset, np.timedelta64, timedelta)): - # higher up to handle - pass - elif isinstance(other, (datetime, np.datetime64)): - # higher up to handle - pass - else: - if not (is_float(other) or is_integer(other)): - raise TypeError("can only perform ops with scalar values") - - return other - @classmethod def _add_numeric_methods_binary(cls): """ @@ -5456,9 +5341,7 @@ def _add_numeric_methods_unary(cls): def _make_evaluate_unary(op, opstr): def _evaluate_numeric_unary(self): - self._validate_for_numeric_unaryop(op, opstr) attrs = self._get_attributes_dict() - attrs = self._maybe_update_attributes(attrs) return Index(op(self.values), **attrs) _evaluate_numeric_unary.__name__ = opstr @@ -5599,7 +5482,10 @@ def shape(self): """ Return a tuple of the shape of the underlying data. """ - return (len(self),) + # not using "(len(self), )" to return "correct" shape if the values + # consists of a >1 D array (see GH-27775) + # overridden in MultiIndex.shape to avoid materializing the values + return self._values.shape Index._add_numeric_methods_disabled() diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 0f6aa711adc90..82806c7351db6 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -446,9 +446,11 @@ def argsort(self, *args, **kwargs): @cache_readonly def _engine(self): - - # we are going to look things up with the codes themselves - return self._engine_type(lambda: self.codes, len(self)) + # we are going to look things up with the codes themselves. + # To avoid a reference cycle, bind `codes` to a local variable, so + # `self` is not passed into the lambda. 
+ codes = self.codes + return self._engine_type(lambda: codes, len(self)) # introspection @cache_readonly @@ -899,31 +901,12 @@ def _make_compare(op): opname = "__{op}__".format(op=op.__name__) def _evaluate_compare(self, other): - - # if we have a Categorical type, then must have the same - # categories - if isinstance(other, CategoricalIndex): - other = other._values - elif isinstance(other, Index): - other = self._create_categorical(other._values, dtype=self.dtype) - - if isinstance(other, (ABCCategorical, np.ndarray, ABCSeries)): - if len(self.values) != len(other): - raise ValueError("Lengths must match to compare") - - if isinstance(other, ABCCategorical): - if not self.values.is_dtype_equal(other): - raise TypeError( - "categorical index comparisons must " - "have the same categories and ordered " - "attributes" - ) - - result = op(self.values, other) + with np.errstate(all="ignore"): + result = op(self.array, other) if isinstance(result, ABCSeries): # Dispatch to pd.Categorical returned NotImplemented # and we got a Series back; down-cast to ndarray - result = result.values + result = result._values return result return compat.set_function_name(_evaluate_compare, opname, cls) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index af99c7a2754e5..c7664d9777c71 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -15,6 +15,7 @@ from pandas.core.dtypes.common import ( ensure_int64, + is_bool_dtype, is_dtype_equal, is_float, is_integer, @@ -163,6 +164,20 @@ def values(self): def asi8(self): return self._data.asi8 + def __array_wrap__(self, result, context=None): + """ + Gets called after a ufunc. + """ + result = lib.item_from_zerodim(result) + if is_bool_dtype(result) or lib.is_scalar(result): + return result + + attrs = self._get_attributes_dict() + if not is_period_dtype(self) and attrs["freq"]: + # no need to infer if freq is None + attrs["freq"] = "infer" + return Index(result, **attrs) + # ------------------------------------------------------------------------ def equals(self, other): diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 67de7b0196b8e..cce390d98c037 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -4,7 +4,7 @@ import numpy as np -from pandas._libs import Timestamp, index as libindex, lib, tslib as libts +from pandas._libs import NaT, Timestamp, index as libindex, lib, tslib as libts import pandas._libs.join as libjoin from pandas._libs.tslibs import ccalendar, fields, parsing, timezones from pandas.util._decorators import Appender, Substitution, cache_readonly @@ -69,7 +69,7 @@ class DatetimeDelegateMixin(DatetimelikeDelegateMixin): # Some are "raw" methods, the result is not not re-boxed in an Index # We also have a few "extra" attrs, which may or may not be raw, # which we we dont' want to expose in the .dt accessor. - _extra_methods = ["to_period", "to_perioddelta", "to_julian_date"] + _extra_methods = ["to_period", "to_perioddelta", "to_julian_date", "strftime"] _extra_raw_methods = ["to_pydatetime", "_local_timestamps", "_has_same_tz"] _extra_raw_properties = ["_box_func", "tz", "tzinfo"] _delegated_properties = DatetimeArray._datetimelike_ops + _extra_raw_properties @@ -465,14 +465,6 @@ def _convert_for_op(self, value): return _to_M8(value) raise ValueError("Passed item and index have different timezone") - def _maybe_update_attributes(self, attrs): - """ Update Index attributes (e.g. 
freq) depending on op """ - freq = attrs.get("freq", None) - if freq is not None: - # no need to infer if freq is None - attrs["freq"] = "infer" - return attrs - # -------------------------------------------------------------------- # Rendering Methods @@ -669,7 +661,7 @@ def _get_time_micros(self): def to_series(self, keep_tz=None, index=None, name=None): """ Create a Series with both index and values equal to the index keys - useful with map for returning an indexer based on an index + useful with map for returning an indexer based on an index. Parameters ---------- @@ -695,10 +687,10 @@ def to_series(self, keep_tz=None, index=None, name=None): behaviour and silence the warning. index : Index, optional - index of resulting Series. If None, defaults to original index - name : string, optional - name of resulting Series. If None, defaults to name of original - index + Index of resulting Series. If None, defaults to original index. + name : str, optional + Name of resulting Series. If None, defaults to name of original + index. Returns ------- @@ -743,7 +735,7 @@ def to_series(self, keep_tz=None, index=None, name=None): def snap(self, freq="S"): """ - Snap time stamps to nearest occurring frequency + Snap time stamps to nearest occurring frequency. Returns ------- @@ -1184,7 +1176,6 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None): is_normalized = cache_readonly(DatetimeArray.is_normalized.fget) # type: ignore _resolution = cache_readonly(DatetimeArray._resolution.fget) # type: ignore - strftime = ea_passthrough(DatetimeArray.strftime) _has_same_tz = ea_passthrough(DatetimeArray._has_same_tz) @property @@ -1282,7 +1273,9 @@ def insert(self, loc, item): raise ValueError("Passed item and index have different timezone") # check freq can be preserved on edge cases if self.size and self.freq is not None: - if (loc == 0 or loc == -len(self)) and item + self.freq == self[0]: + if item is NaT: + pass + elif (loc == 0 or loc == -len(self)) and item + self.freq == self[0]: freq = self.freq elif (loc == len(self)) and item - self.freq == self[-1]: freq = self.freq @@ -1601,7 +1594,7 @@ def bdate_range( ): """ Return a fixed frequency DatetimeIndex, with business day as the default - frequency + frequency. Parameters ---------- diff --git a/pandas/core/indexes/frozen.py b/pandas/core/indexes/frozen.py index 2e5b3ff8ef502..329456e25bded 100644 --- a/pandas/core/indexes/frozen.py +++ b/pandas/core/indexes/frozen.py @@ -22,7 +22,6 @@ class FrozenList(PandasObject, list): - """ Container that doesn't allow setting item *but* because it's technically non-hashable, will be used diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 7a444683ffcb2..7c581a12764b1 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -269,22 +269,6 @@ def from_arrays( ) return cls._simple_new(array, name=name) - @classmethod - @Appender(_interval_shared_docs["from_intervals"] % _index_doc_kwargs) - def from_intervals(cls, data, closed=None, name=None, copy=False, dtype=None): - msg = ( - "IntervalIndex.from_intervals is deprecated and will be " - "removed in a future version; Use IntervalIndex(...) 
instead" - ) - warnings.warn(msg, FutureWarning, stacklevel=2) - with rewrite_exception("IntervalArray", cls.__name__): - array = IntervalArray(data, closed=closed, copy=copy, dtype=dtype) - - if name is None and isinstance(data, cls): - name = data.name - - return cls._simple_new(array, name=name) - @classmethod @Appender(_interval_shared_docs["from_tuples"] % _index_doc_kwargs) def from_tuples(cls, data, closed="right", name=None, copy=False, dtype=None): @@ -347,7 +331,8 @@ def __contains__(self, key): >>> idx.to_tuples() Index([(0.0, 1.0), (nan, nan), (2.0, 3.0)], dtype='object') >>> idx.to_tuples(na_tuple=False) - Index([(0.0, 1.0), nan, (2.0, 3.0)], dtype='object')""", + Index([(0.0, 1.0), nan, (2.0, 3.0)], dtype='object') + """, ) ) def to_tuples(self, na_tuple=True): @@ -804,7 +789,7 @@ def _find_non_overlapping_monotonic_bounds(self, key): return start, stop def get_loc( - self, key: Any, method: Optional[str] = None + self, key: Any, method: Optional[str] = None, tolerance=None ) -> Union[int, slice, np.ndarray]: """ Get integer location, slice or boolean mask for requested label. @@ -998,7 +983,7 @@ def get_indexer_for(self, target: AnyArrayLike, **kwargs) -> np.ndarray: List of indices. """ if self.is_overlapping: - return self.get_indexer_non_unique(target, **kwargs)[0] + return self.get_indexer_non_unique(target)[0] return self.get_indexer(target, **kwargs) @Appender(_index_shared_docs["get_value"] % _index_doc_kwargs) @@ -1111,12 +1096,8 @@ def _format_with_header(self, header, **kwargs): return header + list(self._format_native_types(**kwargs)) def _format_native_types(self, na_rep="NaN", quoting=None, **kwargs): - """ actually format my specific types """ - from pandas.io.formats.format import ExtensionArrayFormatter - - return ExtensionArrayFormatter( - values=self, na_rep=na_rep, justify="all", leading_space=False - ).get_result() + # GH 28210: use base method but with different default na_rep + return super()._format_native_types(na_rep=na_rep, quoting=quoting, **kwargs) def _format_data(self, name=None): @@ -1326,7 +1307,7 @@ def interval_range( start=None, end=None, periods=None, freq=None, name=None, closed="right" ): """ - Return a fixed frequency IntervalIndex + Return a fixed frequency IntervalIndex. Parameters ---------- diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 488107690fbd6..761862b9f30e9 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -622,6 +622,15 @@ def _values(self): # We override here, since our parent uses _data, which we dont' use. return self.values + @property + def shape(self): + """ + Return a tuple of the shape of the underlying data. + """ + # overriding the base Index.shape definition to avoid materializing + # the values (GH-27384, GH-27775) + return (len(self),) + @property def array(self): """ @@ -1241,7 +1250,7 @@ def _set_names(self, names, level=None, validate=True): self.levels[l].rename(name, inplace=True) names = property( - fset=_set_names, fget=_get_names, doc="""\nNames of levels in MultiIndex\n""" + fset=_set_names, fget=_get_names, doc="""\nNames of levels in MultiIndex.\n""" ) @Appender(_index_shared_docs["_get_grouper_for_level"]) @@ -1753,7 +1762,7 @@ def is_all_dates(self): def is_lexsorted(self): """ - Return True if the codes are lexicographically sorted + Return True if the codes are lexicographically sorted. Returns ------- @@ -2237,7 +2246,7 @@ def swaplevel(self, i=-2, j=-1): def reorder_levels(self, order): """ - Rearrange levels using input order. 
May not drop or duplicate levels + Rearrange levels using input order. May not drop or duplicate levels. Parameters ---------- diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index f6b3d1076043e..f7bf77928bdc7 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -1,5 +1,6 @@ from datetime import datetime, timedelta import warnings +import weakref import numpy as np @@ -63,7 +64,10 @@ class PeriodDelegateMixin(DatetimelikeDelegateMixin): _delegate_class = PeriodArray _delegated_properties = PeriodArray._datetimelike_ops - _delegated_methods = set(PeriodArray._datetimelike_methods) | {"_addsub_int_array"} + _delegated_methods = set(PeriodArray._datetimelike_methods) | { + "_addsub_int_array", + "strftime", + } _raw_properties = {"is_leap_year"} @@ -438,7 +442,9 @@ def _formatter_func(self): @cache_readonly def _engine(self): - return self._engine_type(lambda: self, len(self)) + # To avoid a reference cycle, pass a weakref of self to _engine_type. + period = weakref.ref(self) + return self._engine_type(period, len(self)) @Appender(_index_shared_docs["contains"]) def __contains__(self, key): @@ -988,7 +994,7 @@ def memory_usage(self, deep=False): def period_range(start=None, end=None, periods=None, freq=None, name=None): """ Return a fixed frequency PeriodIndex, with day (calendar) as the default - frequency + frequency. Parameters ---------- diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index d2bea5f68b92d..8783351cc74d1 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -25,6 +25,7 @@ from pandas.core import ops import pandas.core.common as com +from pandas.core.construction import extract_array import pandas.core.indexes.base as ibase from pandas.core.indexes.base import Index, _index_shared_docs from pandas.core.indexes.numeric import Int64Index @@ -74,7 +75,7 @@ class RangeIndex(Int64Index): _engine_type = libindex.Int64Engine _range = None # type: range - # check whether self._data has benn called + # check whether self._data has been called _cached_data = None # type: np.ndarray # -------------------------------------------------------------------- # Constructors @@ -235,7 +236,7 @@ def _format_with_header(self, header, na_rep="NaN", **kwargs): @cache_readonly def start(self): """ - The value of the `start` parameter (``0`` if this was not supplied) + The value of the `start` parameter (``0`` if this was not supplied). """ # GH 25710 return self._range.start @@ -243,7 +244,7 @@ def start(self): @property def _start(self): """ - The value of the `start` parameter (``0`` if this was not supplied) + The value of the `start` parameter (``0`` if this was not supplied). .. deprecated:: 0.25.0 Use ``start`` instead. @@ -258,14 +259,14 @@ def _start(self): @cache_readonly def stop(self): """ - The value of the `stop` parameter + The value of the `stop` parameter. """ return self._range.stop @property def _stop(self): """ - The value of the `stop` parameter + The value of the `stop` parameter. .. deprecated:: 0.25.0 Use ``stop`` instead. @@ -281,7 +282,7 @@ def _stop(self): @cache_readonly def step(self): """ - The value of the `step` parameter (``1`` if this was not supplied) + The value of the `step` parameter (``1`` if this was not supplied). """ # GH 25710 return self._range.step @@ -289,7 +290,7 @@ def step(self): @property def _step(self): """ - The value of the `step` parameter (``1`` if this was not supplied) + The value of the `step` parameter (``1`` if this was not supplied). 
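For the weakref-based PeriodIndex._engine above, a minimal illustration (hypothetical stand-ins, not the real PeriodEngine) of how passing weakref.ref(self) instead of the index itself keeps the cached engine from keeping the index alive:

    import weakref

    class PeriodEngineSketch:
        def __init__(self, period_ref, length):
            # Store the weak reference; dereference it only when values are needed.
            self.period_ref = period_ref
            self.length = length

        def values(self):
            period = self.period_ref()  # None once the index has been collected
            return None if period is None else list(period)

    class PeriodIndexSketch:
        def __init__(self, values):
            self._values = list(values)
            self._engine_cache = None

        def __len__(self):
            return len(self._values)

        def __iter__(self):
            return iter(self._values)

        @property
        def _engine(self):
            # Passing weakref.ref(self) rather than self (or a closure over self)
            # means the cached engine does not create a reference cycle.
            if self._engine_cache is None:
                self._engine_cache = PeriodEngineSketch(weakref.ref(self), len(self))
            return self._engine_cache

    idx = PeriodIndexSketch([2000, 2001, 2002])
    print(idx._engine.values())  # [2000, 2001, 2002]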
.. deprecated:: 0.25.0 Use ``step`` instead. @@ -782,9 +783,8 @@ def _evaluate_numeric_binop(self, other): # Must be an np.ndarray; GH#22390 return op(self._int64index, other) - other = self._validate_for_numeric_binop(other, op) + other = extract_array(other, extract_numpy=True) attrs = self._get_attributes_dict() - attrs = self._maybe_update_attributes(attrs) left, right = self, other diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index f2ce562536b95..b03d60c7b5b37 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -44,7 +44,12 @@ class TimedeltaDelegateMixin(DatetimelikeDelegateMixin): # which we we dont' want to expose in the .dt accessor. _delegate_class = TimedeltaArray _delegated_properties = TimedeltaArray._datetimelike_ops + ["components"] - _delegated_methods = TimedeltaArray._datetimelike_methods + ["_box_values"] + _delegated_methods = TimedeltaArray._datetimelike_methods + [ + "_box_values", + "__neg__", + "__pos__", + "__abs__", + ] _raw_properties = {"components"} _raw_methods = {"to_pytimedelta"} @@ -56,27 +61,27 @@ class TimedeltaDelegateMixin(DatetimelikeDelegateMixin): TimedeltaArray, TimedeltaDelegateMixin._delegated_methods, typ="method", - overwrite=False, + overwrite=True, ) class TimedeltaIndex( DatetimeIndexOpsMixin, dtl.TimelikeOps, Int64Index, TimedeltaDelegateMixin ): """ Immutable ndarray of timedelta64 data, represented internally as int64, and - which can be boxed to timedelta objects + which can be boxed to timedelta objects. Parameters ---------- data : array-like (1-dimensional), optional - Optional timedelta-like data to construct index with + Optional timedelta-like data to construct index with. unit : unit of the arg (D,h,m,s,ms,us,ns) denote the unit, optional - which is an integer/float number - freq : string or pandas offset object, optional + Which is an integer/float number. + freq : str or pandas offset object, optional One of pandas date offset strings or corresponding objects. The string 'infer' can be passed in order to set the frequency of the index as the - inferred frequency upon creation + inferred frequency upon creation. copy : bool - Make a copy of input ndarray + Make a copy of input ndarray. start : starting value, timedelta-like, optional If data is None, start is used as the start point in generating regular timedelta data. @@ -85,24 +90,24 @@ class TimedeltaIndex( periods : int, optional, > 0 Number of periods to generate, if generating index. Takes precedence - over end argument + over end argument. .. deprecated:: 0.24.0 end : end time, timedelta-like, optional If periods is none, generated index will extend to first conforming - time on or just past end argument + time on or just past end argument. .. deprecated:: 0.24. 0 - closed : string or None, default None + closed : str or None, default None Make the interval closed with respect to the given frequency to - the 'left', 'right', or both sides (None) + the 'left', 'right', or both sides (None). .. deprecated:: 0.24. 0 name : object - Name to be stored in the index + Name to be stored in the index. Attributes ---------- @@ -279,14 +284,6 @@ def __setstate__(self, state): _unpickle_compat = __setstate__ - def _maybe_update_attributes(self, attrs): - """ Update Index attributes (e.g. 
freq) depending on op """ - freq = attrs.get("freq", None) - if freq is not None: - # no need to infer if freq is None - attrs["freq"] = "infer" - return attrs - # ------------------------------------------------------------------- # Rendering Methods @@ -689,7 +686,6 @@ def delete(self, loc): TimedeltaIndex._add_comparison_ops() -TimedeltaIndex._add_numeric_methods_unary() TimedeltaIndex._add_logical_methods_disabled() TimedeltaIndex._add_datetimelike_methods() @@ -717,7 +713,7 @@ def timedelta_range( ): """ Return a fixed frequency TimedeltaIndex, with day as the default - frequency + frequency. Parameters ---------- diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index e308ae03730b3..3d495eeb8c885 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -22,11 +22,11 @@ is_sparse, ) from pandas.core.dtypes.concat import concat_compat -from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries +from pandas.core.dtypes.generic import ABCDataFrame, ABCMultiIndex, ABCSeries from pandas.core.dtypes.missing import _infer_fill_value, isna import pandas.core.common as com -from pandas.core.index import Index, InvalidIndexError, MultiIndex +from pandas.core.index import Index, InvalidIndexError from pandas.core.indexers import is_list_like_indexer, length_of_indexer @@ -49,7 +49,7 @@ def get_indexers_list(): # the public IndexSlicerMaker class _IndexSlice: """ - Create an object to more easily perform multi-index slicing + Create an object to more easily perform multi-index slicing. See Also -------- @@ -124,7 +124,7 @@ def __getitem__(self, key): key = tuple(com.apply_if_callable(x, self.obj) for x in key) try: values = self.obj._get_value(*key) - except (KeyError, TypeError, InvalidIndexError): + except (KeyError, TypeError, InvalidIndexError, AttributeError): # TypeError occurs here if the key has non-hashable entries, # generally slice or list. 
# TODO(ix): most/all of the TypeError cases here are for ix, @@ -132,6 +132,9 @@ def __getitem__(self, key): # The InvalidIndexError is only catched for compatibility # with geopandas, see # https://github.com/pandas-dev/pandas/issues/27258 + # TODO: The AttributeError is for IntervalIndex which + # incorrectly implements get_value, see + # https://github.com/pandas-dev/pandas/issues/27865 pass else: if is_scalar(values): @@ -169,7 +172,7 @@ def _get_setitem_indexer(self, key): ax = self.obj._get_axis(0) - if isinstance(ax, MultiIndex) and self.name != "iloc": + if isinstance(ax, ABCMultiIndex) and self.name != "iloc": try: return ax.get_loc(key) except Exception: @@ -238,7 +241,7 @@ def _has_valid_tuple(self, key: Tuple): ) def _is_nested_tuple_indexer(self, tup: Tuple): - if any(isinstance(ax, MultiIndex) for ax in self.obj.axes): + if any(isinstance(ax, ABCMultiIndex) for ax in self.obj.axes): return any(is_nested_tuple(tup, ax) for ax in self.obj.axes) return False @@ -321,6 +324,17 @@ def _setitem_with_indexer(self, indexer, value): val = list(value.values()) if isinstance(value, dict) else value take_split_path = not blk._can_hold_element(val) + # if we have any multi-indexes that have non-trivial slices + # (not null slices) then we must take the split path, xref + # GH 10360, GH 27841 + if isinstance(indexer, tuple) and len(indexer) == len(self.obj.axes): + for i, ax in zip(indexer, self.obj.axes): + if isinstance(ax, ABCMultiIndex) and not ( + is_integer(i) or com.is_null_slice(i) + ): + take_split_path = True + break + if isinstance(indexer, tuple): nindexer = [] for i, idx in enumerate(indexer): @@ -408,7 +422,9 @@ def _setitem_with_indexer(self, indexer, value): # if we have a partial multiindex, then need to adjust the plane # indexer here - if len(labels) == 1 and isinstance(self.obj[labels[0]].axes[0], MultiIndex): + if len(labels) == 1 and isinstance( + self.obj[labels[0]].axes[0], ABCMultiIndex + ): item = labels[0] obj = self.obj[item] index = obj.index @@ -481,7 +497,7 @@ def setter(item, v): # we have an equal len Frame if isinstance(value, ABCDataFrame): sub_indexer = list(indexer) - multiindex_indexer = isinstance(labels, MultiIndex) + multiindex_indexer = isinstance(labels, ABCMultiIndex) for item in labels: if item in value: @@ -763,8 +779,8 @@ def _align_frame(self, indexer, df: ABCDataFrame): # we have a multi-index and are trying to align # with a particular, level GH3738 if ( - isinstance(ax, MultiIndex) - and isinstance(df.index, MultiIndex) + isinstance(ax, ABCMultiIndex) + and isinstance(df.index, ABCMultiIndex) and ax.nlevels != df.index.nlevels ): raise TypeError( @@ -890,7 +906,7 @@ def _getitem_lowerdim(self, tup: Tuple): ax0 = self.obj._get_axis(0) # ...but iloc should handle the tuple as simple integer-location # instead of checking it as multiindex representation (GH 13797) - if isinstance(ax0, MultiIndex) and self.name != "iloc": + if isinstance(ax0, ABCMultiIndex) and self.name != "iloc": result = self._handle_lowerdim_multi_index_axis0(tup) if result is not None: return result @@ -990,7 +1006,7 @@ def _getitem_axis(self, key, axis: int): if isinstance(key, slice): return self._get_slice_axis(key, axis=axis) elif is_list_like_indexer(key) and not ( - isinstance(key, tuple) and isinstance(labels, MultiIndex) + isinstance(key, tuple) and isinstance(labels, ABCMultiIndex) ): if hasattr(key, "ndim") and key.ndim > 1: @@ -1003,7 +1019,7 @@ def _getitem_axis(self, key, axis: int): key = labels._maybe_cast_indexer(key) if 
is_integer(key): - if axis == 0 and isinstance(labels, MultiIndex): + if axis == 0 and isinstance(labels, ABCMultiIndex): try: return self._get_label(key, axis=axis) except (KeyError, TypeError): @@ -1214,7 +1230,7 @@ def _convert_to_indexer(self, obj, axis: int, raise_missing: bool = False): try: return labels.get_loc(obj) except LookupError: - if isinstance(obj, tuple) and isinstance(labels, MultiIndex): + if isinstance(obj, tuple) and isinstance(labels, ABCMultiIndex): if len(obj) == labels.nlevels: return {"key": obj} raise @@ -1234,7 +1250,7 @@ def _convert_to_indexer(self, obj, axis: int, raise_missing: bool = False): # always valid return {"key": obj} - if obj >= self.obj.shape[axis] and not isinstance(labels, MultiIndex): + if obj >= self.obj.shape[axis] and not isinstance(labels, ABCMultiIndex): # a positional raise ValueError("cannot set by positional indexing with enlargement") @@ -1701,7 +1717,7 @@ def _is_scalar_access(self, key: Tuple): return False ax = self.obj.axes[i] - if isinstance(ax, MultiIndex): + if isinstance(ax, ABCMultiIndex): return False if isinstance(k, str) and ax._supports_partial_string_indexing: @@ -1723,7 +1739,7 @@ def _getitem_scalar(self, key): def _get_partial_string_timestamp_match_key(self, key, labels): """Translate any partial string timestamp matches in key, returning the new key (GH 10331)""" - if isinstance(labels, MultiIndex): + if isinstance(labels, ABCMultiIndex): if ( isinstance(key, str) and labels.levels[0]._supports_partial_string_indexing @@ -1767,7 +1783,7 @@ def _getitem_axis(self, key, axis: int): # to a list of keys # we will use the *values* of the object # and NOT the index if its a PandasObject - if isinstance(labels, MultiIndex): + if isinstance(labels, ABCMultiIndex): if isinstance(key, (ABCSeries, np.ndarray)) and key.ndim <= 1: # Series, or 0,1 ndim ndarray @@ -1795,7 +1811,7 @@ def _getitem_axis(self, key, axis: int): key = tuple([key]) # an iterable multi-selection - if not (isinstance(key, tuple) and isinstance(labels, MultiIndex)): + if not (isinstance(key, tuple) and isinstance(labels, ABCMultiIndex)): if hasattr(key, "ndim") and key.ndim > 1: raise ValueError("Cannot index with multidimensional key") @@ -2460,7 +2476,7 @@ def is_nested_tuple(tup, labels): for i, k in enumerate(tup): if is_list_like(k) or isinstance(k, slice): - return isinstance(labels, MultiIndex) + return isinstance(labels, ABCMultiIndex) return False diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 6a2aebe5db246..33698d245e9ff 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -7,7 +7,7 @@ import numpy as np -from pandas._libs import NaT, Timestamp, lib, tslib, tslibs +from pandas._libs import NaT, Timestamp, lib, tslib import pandas._libs.internals as libinternals from pandas._libs.tslibs import Timedelta, conversion from pandas._libs.tslibs.timezones import tz_compare @@ -18,6 +18,7 @@ find_common_type, infer_dtype_from, infer_dtype_from_scalar, + maybe_downcast_numeric, maybe_downcast_to_dtype, maybe_infer_dtype_type, maybe_promote, @@ -55,7 +56,6 @@ ABCDataFrame, ABCDatetimeIndex, ABCExtensionArray, - ABCIndexClass, ABCPandasArray, ABCSeries, ) @@ -68,13 +68,7 @@ ) import pandas.core.algorithms as algos -from pandas.core.arrays import ( - Categorical, - DatetimeArray, - ExtensionArray, - PandasDtype, - TimedeltaArray, -) +from pandas.core.arrays import Categorical, DatetimeArray, PandasDtype, TimedeltaArray from pandas.core.base import PandasObject import pandas.core.common as 
com from pandas.core.construction import extract_array @@ -209,10 +203,6 @@ def internal_values(self, dtype=None): """ return self.values - def formatting_values(self): - """Return the internal values used by the DataFrame/SeriesFormatter""" - return self.internal_values() - def get_values(self, dtype=None): """ return an internal format, currently just the ndarray @@ -273,6 +263,8 @@ def make_block_same_class(self, values, placement=None, ndim=None, dtype=None): ) if placement is None: placement = self.mgr_locs + if ndim is None: + ndim = self.ndim return make_block( values, placement=placement, ndim=ndim, klass=self.__class__, dtype=dtype ) @@ -415,7 +407,7 @@ def fillna(self, value, limit=None, inplace=False, downcast=None): return self.copy() if self._can_hold_element(value): - # equivalent: self._try_coerce_args(value) would not raise + # equivalent: _try_coerce_args(value) would not raise blocks = self.putmask(mask, value, inplace=inplace) return self._maybe_downcast(blocks, downcast) @@ -434,7 +426,7 @@ def f(m, v, i): return self.split_and_operate(mask, f, inplace) - def split_and_operate(self, mask, f, inplace): + def split_and_operate(self, mask, f, inplace: bool): """ split the block per-column, and apply the callable f per-column, return a new block for each. Handle @@ -493,17 +485,15 @@ def make_a_block(nv, ref_loc): return new_blocks - def _maybe_downcast(self, blocks, downcast=None): + def _maybe_downcast(self, blocks: List["Block"], downcast=None) -> List["Block"]: # no need to downcast our float # unless indicated - if downcast is None and self.is_float: - return blocks - elif downcast is None and (self.is_timedelta or self.is_datetime): + if downcast is None and ( + self.is_float or self.is_timedelta or self.is_datetime + ): return blocks - if not isinstance(blocks, list): - blocks = [blocks] return _extend_blocks([b.downcast(downcast) for b in blocks]) def downcast(self, dtypes=None): @@ -679,7 +669,7 @@ def convert( return self.copy() if copy else self - def _can_hold_element(self, element): + def _can_hold_element(self, element: Any) -> bool: """ require the same dtype as ourselves """ dtype = self.values.dtype.type tipo = maybe_infer_dtype_type(element) @@ -687,28 +677,6 @@ def _can_hold_element(self, element): return issubclass(tipo.type, dtype) return isinstance(element, dtype) - def _try_cast_result(self, result, dtype=None): - """ try to cast the result to our original type, we may have - roundtripped thru object in the mean-time - """ - if dtype is None: - dtype = self.dtype - - if self.is_integer or self.is_bool or self.is_datetime: - pass - elif self.is_float and result.dtype == self.dtype: - # protect against a bool/object showing up here - if isinstance(dtype, str) and dtype == "infer": - return result - - # This is only reached via Block.setitem, where dtype is always - # either "infer", self.dtype, or values.dtype. 
- assert dtype == self.dtype, (dtype, self.dtype) - return result - - # may need to change the dtype here - return maybe_downcast_to_dtype(result, dtype) - def _try_coerce_args(self, other): """ provide coercion to our input arguments """ @@ -731,10 +699,6 @@ def _try_coerce_args(self, other): return other - def _try_coerce_and_cast_result(self, result, dtype=None): - result = self._try_cast_result(result, dtype=dtype) - return result - def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs): """ convert to our native types format, slicing if desired """ @@ -774,8 +738,31 @@ def replace( # If we cannot replace with own dtype, convert to ObjectBlock and # retry if not self._can_hold_element(to_replace): - # TODO: we should be able to infer at this point that there is - # nothing to replace + if not isinstance(to_replace, list): + if inplace: + return [self] + return [self.copy()] + + to_replace = [x for x in to_replace if self._can_hold_element(x)] + if not len(to_replace): + # GH#28084 avoid costly checks since we can infer + # that there is nothing to replace in this block + if inplace: + return [self] + return [self.copy()] + + if len(to_replace) == 1: + # _can_hold_element checks have reduced this back to the + # scalar case and we can avoid a costly object cast + return self.replace( + to_replace[0], + value, + inplace=inplace, + filter=filter, + regex=regex, + convert=convert, + ) + # GH 22083, TypeError or ValueError occurred within error handling # causes infinite loop. Cast and retry only if not objectblock. if is_object_dtype(self): @@ -784,7 +771,7 @@ def replace( # try again with a compatible block block = self.astype(object) return block.replace( - to_replace=original_to_replace, + to_replace=to_replace, value=value, inplace=inplace, filter=filter, @@ -800,14 +787,27 @@ def replace( filtered_out = ~self.mgr_locs.isin(filter) mask[filtered_out.nonzero()[0]] = False + if not mask.any(): + if inplace: + return [self] + return [self.copy()] + try: blocks = self.putmask(mask, value, inplace=inplace) + # Note: it is _not_ the case that self._can_hold_element(value) + # is always true at this point. In particular, that can fail + # for: + # "2u" with bool-dtype, float-dtype + # 0.5 with int64-dtype + # np.nan with int64-dtype except (TypeError, ValueError): # GH 22083, TypeError or ValueError occurred within error handling # causes infinite loop. Cast and retry only if not objectblock. 
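A small standalone sketch of the GH#28084 early-exit idea in Block.replace above (hypothetical helper names, not the Block API): values the block's dtype cannot hold are filtered out of to_replace up front, and if nothing survives the replace is a no-op.

    def prune_to_replace(can_hold_element, to_replace):
        # Mirror of the fast path: keep only values the block could contain.
        if not isinstance(to_replace, list):
            to_replace = [to_replace]
        return [x for x in to_replace if can_hold_element(x)]

    # For an integer block, replacing "a" or 0.5 can be skipped entirely:
    def can_hold_int(x):
        return isinstance(x, int) and not isinstance(x, bool)

    print(prune_to_replace(can_hold_int, ["a", 0.5, 3]))  # [3]
    print(prune_to_replace(can_hold_int, "a"))            # [] -> nothing to do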
if is_object_dtype(self): raise + assert not self._can_hold_element(value), value + # try again with a compatible block block = self.astype(object) return block.replace( @@ -857,12 +857,6 @@ def setitem(self, indexer, value): if self._can_hold_element(value): value = self._try_coerce_args(value) - # can keep its own dtype - if hasattr(value, "dtype") and is_dtype_equal(values.dtype, value.dtype): - dtype = self.dtype - else: - dtype = "infer" - else: # current dtype cannot store value, coerce to common dtype find_dtype = False @@ -871,15 +865,9 @@ def setitem(self, indexer, value): dtype = value.dtype find_dtype = True - elif lib.is_scalar(value): - if isna(value): - # NaN promotion is handled in latter path - dtype = False - else: - dtype, _ = infer_dtype_from_scalar(value, pandas_dtype=True) - find_dtype = True - else: - dtype = "infer" + elif lib.is_scalar(value) and not isna(value): + dtype, _ = infer_dtype_from_scalar(value, pandas_dtype=True) + find_dtype = True if find_dtype: dtype = find_common_type([values.dtype, dtype]) @@ -927,8 +915,6 @@ def setitem(self, indexer, value): else: values[indexer] = value - # coerce and try to infer the dtypes of the result - values = self._try_coerce_and_cast_result(values, dtype) if transpose: values = values.T block = self.make_block(values) @@ -962,6 +948,7 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, transpose=False) # if we are passed a scalar None, convert it here if not is_list_like(new) and isna(new) and not self.is_object: + # FIXME: make sure we have compatible NA new = self.fill_value if self._can_hold_element(new): @@ -1089,7 +1076,7 @@ def coerce_to_target_dtype(self, other): mytz = getattr(self.dtype, "tz", None) othertz = getattr(dtype, "tz", None) - if str(mytz) != str(othertz): + if not tz_compare(mytz, othertz): return self.astype(object) raise AssertionError( @@ -1309,7 +1296,7 @@ def take_nd(self, indexer, axis, new_mgr_locs=None, fill_tuple=None): else: return self.make_block_same_class(new_values, new_mgr_locs) - def diff(self, n, axis=1): + def diff(self, n: int, axis: int = 1) -> List["Block"]: """ return block for the diff of the values """ new_values = algos.diff(self.values, n, axis=axis) return [self.make_block(values=new_values)] @@ -1343,7 +1330,15 @@ def shift(self, periods, axis=0, fill_value=None): return [self.make_block(new_values)] - def where(self, other, cond, align=True, errors="raise", try_cast=False, axis=0): + def where( + self, + other, + cond, + align=True, + errors="raise", + try_cast: bool = False, + axis: int = 0, + ) -> List["Block"]: """ evaluate the block; return result block(s) from the result @@ -1390,26 +1385,14 @@ def func(cond, values, other): if not ( (self.is_integer or self.is_bool) - and lib.is_scalar(other) + and lib.is_float(other) and np.isnan(other) ): # np.where will cast integer array to floats in this case other = self._try_coerce_args(other) - try: - fastres = expressions.where(cond, values, other) - return fastres - except Exception as detail: - if errors == "raise": - raise TypeError( - "Could not operate [{other!r}] with block values " - "[{detail!s}]".format(other=other, detail=detail) - ) - else: - # return the values - result = np.empty(values.shape, dtype="float64") - result.fill(np.nan) - return result + fastres = expressions.where(cond, values, other) + return fastres if cond.ravel().all(): result = values @@ -1438,11 +1421,7 @@ def func(cond, values, other): if transpose: result = result.T - # try to cast if requested - if try_cast: - result = 
self._try_cast_result(result) - - return self.make_block(result) + return [self.make_block(result)] # might need to separate out blocks axis = cond.ndim - 1 @@ -1453,13 +1432,13 @@ def func(cond, values, other): for m in [mask, ~mask]: if m.any(): taken = result.take(m.nonzero()[0], axis=axis) - r = self._try_cast_result(taken) + r = maybe_downcast_numeric(taken, self.dtype) nb = self.make_block(r.T, placement=self.mgr_locs[m]) result_blocks.append(nb) return result_blocks - def equals(self, other): + def equals(self, other) -> bool: if self.dtype != other.dtype or self.shape != other.shape: return False return array_equivalent(self.values, other.values) @@ -1474,9 +1453,9 @@ def _unstack(self, unstacker_func, new_columns, n_rows, fill_value): new_columns : Index All columns of the unstacked BlockManager. n_rows : int - Only used in ExtensionBlock.unstack + Only used in ExtensionBlock._unstack fill_value : int - Only used in ExtensionBlock.unstack + Only used in ExtensionBlock._unstack Returns ------- @@ -1550,7 +1529,7 @@ def quantile(self, qs, interpolation="linear", axis=0): result = result[..., 0] result = lib.item_from_zerodim(result) - ndim = getattr(result, "ndim", None) or 0 + ndim = np.ndim(result) return make_block(result, placement=np.arange(len(result)), ndim=ndim) def _replace_coerce( @@ -1686,9 +1665,6 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, transpose=False) new_values[mask] = new return [self.make_block(values=new_values)] - def _try_cast_result(self, result, dtype=None): - return result - def _get_unstack_items(self, unstacker, new_columns): """ Get the placement, values, and mask for a Block unstack. @@ -1740,7 +1716,8 @@ def __init__(self, values, placement, ndim=None): super().__init__(values, placement, ndim) def _maybe_coerce_values(self, values): - """Unbox to an extension array. + """ + Unbox to an extension array. This will unbox an ExtensionArray stored in an Index or Series. ExtensionArrays pass through. No dtype coercion is done. @@ -1753,9 +1730,7 @@ def _maybe_coerce_values(self, values): ------- ExtensionArray """ - if isinstance(values, (ABCIndexClass, ABCSeries)): - values = values._values - return values + return extract_array(values) @property def _holder(self): @@ -1843,7 +1818,7 @@ def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None): return self.make_block_same_class(new_values, new_mgr_locs) - def _can_hold_element(self, element): + def _can_hold_element(self, element: Any) -> bool: # XXX: We may need to think about pushing this onto the array. # We're doing the same as CategoricalBlock here. return True @@ -1861,21 +1836,6 @@ def _slice(self, slicer): return self.values[slicer] - def formatting_values(self): - # Deprecating the ability to override _formatting_values. - # Do the warning here, it's only user in pandas, since we - # have to check if the subclass overrode it. - fv = getattr(type(self.values), "_formatting_values", None) - if fv and fv != ExtensionArray._formatting_values: - msg = ( - "'ExtensionArray._formatting_values' is deprecated. " - "Specify 'ExtensionArray._formatter' instead." - ) - warnings.warn(msg, FutureWarning, stacklevel=10) - return self.values._formatting_values() - - return self.values - def concat_same_type(self, to_concat, placement=None): """ Concatenate list of single blocks of the same type. 
@@ -1923,7 +1883,15 @@ def shift( ) ] - def where(self, other, cond, align=True, errors="raise", try_cast=False, axis=0): + def where( + self, + other, + cond, + align=True, + errors="raise", + try_cast: bool = False, + axis: int = 0, + ) -> List["Block"]: if isinstance(other, ABCDataFrame): # ExtensionArrays are 1-D, so if we get here then # `other` should be a DataFrame with a single column. @@ -1968,7 +1936,7 @@ def where(self, other, cond, align=True, errors="raise", try_cast=False, axis=0) np.where(cond, self.values, other), dtype=dtype ) - return self.make_block_same_class(result, placement=self.mgr_locs) + return [self.make_block_same_class(result, placement=self.mgr_locs)] @property def _ftype(self): @@ -2020,7 +1988,7 @@ class NumericBlock(Block): class FloatOrComplexBlock(NumericBlock): __slots__ = () - def equals(self, other): + def equals(self, other) -> bool: if self.dtype != other.dtype or self.shape != other.shape: return False left, right = self.values, other.values @@ -2031,7 +1999,7 @@ class FloatBlock(FloatOrComplexBlock): __slots__ = () is_float = True - def _can_hold_element(self, element): + def _can_hold_element(self, element: Any) -> bool: tipo = maybe_infer_dtype_type(element) if tipo is not None: return issubclass(tipo.type, (np.floating, np.integer)) and not issubclass( @@ -2095,7 +2063,7 @@ class ComplexBlock(FloatOrComplexBlock): __slots__ = () is_complex = True - def _can_hold_element(self, element): + def _can_hold_element(self, element: Any) -> bool: tipo = maybe_infer_dtype_type(element) if tipo is not None: return issubclass(tipo.type, (np.floating, np.integer, np.complexfloating)) @@ -2112,7 +2080,7 @@ class IntBlock(NumericBlock): is_integer = True _can_hold_na = False - def _can_hold_element(self, element): + def _can_hold_element(self, element: Any) -> bool: tipo = maybe_infer_dtype_type(element) if tipo is not None: return ( @@ -2202,7 +2170,7 @@ def _astype(self, dtype, **kwargs): # delegate return super()._astype(dtype=dtype, **kwargs) - def _can_hold_element(self, element): + def _can_hold_element(self, element: Any) -> bool: tipo = maybe_infer_dtype_type(element) if tipo is not None: if self.is_datetimetz: @@ -2392,41 +2360,19 @@ def _slice(self, slicer): return self.values[slicer] def _try_coerce_args(self, other): - """ - localize and return i8 for the values - - Parameters - ---------- - other : ndarray-like or scalar - - Returns - ------- - base-type other - """ - if is_valid_nat_for_dtype(other, self.dtype): - other = np.datetime64("NaT", "ns") - elif isinstance(other, self._holder): - if not tz_compare(other.tz, self.values.tz): - raise ValueError("incompatible or non tz-aware value") - - elif isinstance(other, (np.datetime64, datetime, date)): - other = tslibs.Timestamp(other) - - # test we can have an equal time zone - if not tz_compare(other.tz, self.values.tz): - raise ValueError("incompatible or non tz-aware value") - else: - raise TypeError(other) - + # DatetimeArray handles this for us return other - def diff(self, n, axis=0): - """1st discrete difference + def diff(self, n: int, axis: int = 0) -> List["Block"]: + """ + 1st discrete difference. Parameters ---------- - n : int, number of periods to diff - axis : int, axis to diff upon. default 0 + n : int + Number of periods to diff. + axis : int, default 0 + Axis to diff upon. 
Returns ------- @@ -2488,7 +2434,7 @@ def setitem(self, indexer, value): ) return newb.setitem(indexer, value) - def equals(self, other): + def equals(self, other) -> bool: # override for significant performance improvement if self.dtype != other.dtype or self.shape != other.shape: return False @@ -2527,7 +2473,7 @@ def __init__(self, values, placement, ndim=None): def _holder(self): return TimedeltaArray - def _can_hold_element(self, element): + def _can_hold_element(self, element: Any) -> bool: tipo = maybe_infer_dtype_type(element) if tipo is not None: return issubclass(tipo.type, np.timedelta64) @@ -2620,7 +2566,7 @@ class BoolBlock(NumericBlock): is_bool = True _can_hold_na = False - def _can_hold_element(self, element): + def _can_hold_element(self, element: Any) -> bool: tipo = maybe_infer_dtype_type(element) if tipo is not None: return issubclass(tipo.type, np.bool_) @@ -2706,7 +2652,7 @@ def f(m, v, i): return blocks - def _maybe_downcast(self, blocks, downcast=None): + def _maybe_downcast(self, blocks: List["Block"], downcast=None) -> List["Block"]: if downcast is not None: return blocks @@ -2714,7 +2660,7 @@ def _maybe_downcast(self, blocks, downcast=None): # split and convert the blocks return _extend_blocks([b.convert(datetime=True, numeric=False) for b in blocks]) - def _can_hold_element(self, element): + def _can_hold_element(self, element: Any) -> bool: return True def _try_coerce_args(self, other): @@ -2870,9 +2816,9 @@ def _replace_single( regex = regex_re or to_rep_re # try to get the pattern attribute (compiled re) or it's a string - try: + if is_re(to_replace): pattern = to_replace.pattern - except AttributeError: + else: pattern = to_replace # if the pattern is not empty and to_replace is either a string or a @@ -2893,18 +2839,18 @@ def _replace_single( if isna(value) or not isinstance(value, str): def re_replacer(s): - try: + if is_re(rx) and isinstance(s, str): return value if rx.search(s) is not None else s - except TypeError: + else: return s else: # value is guaranteed to be a string here, s can be either a string # or null if it's null it gets returned def re_replacer(s): - try: + if is_re(rx) and isinstance(s, str): return rx.sub(value, s) - except TypeError: + else: return s f = np.vectorize(re_replacer, otypes=[self.dtype]) @@ -3031,7 +2977,15 @@ def concat_same_type(self, to_concat, placement=None): values, placement=placement or slice(0, len(values), 1), ndim=self.ndim ) - def where(self, other, cond, align=True, errors="raise", try_cast=False, axis=0): + def where( + self, + other, + cond, + align=True, + errors="raise", + try_cast: bool = False, + axis: int = 0, + ) -> List["Block"]: # TODO(CategoricalBlock.where): # This can all be deleted in favor of ExtensionBlock.where once # we enforce the deprecation. diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index e5acd23b77d5d..1c31542daa5de 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1582,10 +1582,6 @@ def external_values(self): def internal_values(self): return self._block.internal_values() - def formatting_values(self): - """Return the internal values used by the DataFrame/SeriesFormatter""" - return self._block.formatting_values() - def get_values(self): """ return a dense type view """ return np.array(self._block.to_dense(), copy=False) @@ -1823,7 +1819,7 @@ def _simple_blockify(tuples, dtype): """ values, placement = _stack_arrays(tuples, dtype) - # CHECK DTYPE? + # TODO: CHECK DTYPE? 
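A standalone sketch of the explicit type check that replaces the try/except TypeError in the regex replacer above (plain re, not the Block machinery):

    import re

    def make_re_replacer(rx, value):
        def re_replacer(s):
            # Check the input type explicitly instead of relying on re raising
            # TypeError for non-string values.
            if isinstance(s, str):
                return rx.sub(value, s)
            return s
        return re_replacer

    rx = re.compile(r"\d+")
    repl = make_re_replacer(rx, "#")
    print([repl(x) for x in ["a1", 2, None, "b22"]])  # ['a#', 2, None, 'b#']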
if dtype is not None and values.dtype != dtype: # pragma: no cover values = values.astype(dtype) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 48b3d74e8d02c..86cd6e878cde6 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -13,51 +13,46 @@ from pandas.errors import NullFrequencyError from pandas.util._decorators import Appender -from pandas.core.dtypes.cast import ( - construct_1d_object_array_from_listlike, - find_common_type, - maybe_upcast_putmask, -) +from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas.core.dtypes.common import ( ensure_object, is_bool_dtype, - is_categorical_dtype, is_datetime64_dtype, - is_datetime64tz_dtype, is_datetimelike_v_numeric, is_extension_array_dtype, is_integer_dtype, is_list_like, is_object_dtype, - is_period_dtype, is_scalar, is_timedelta64_dtype, - needs_i8_conversion, ) from pandas.core.dtypes.generic import ( ABCDataFrame, ABCDatetimeArray, ABCDatetimeIndex, - ABCIndex, + ABCExtensionArray, ABCIndexClass, ABCSeries, - ABCSparseArray, ABCSparseSeries, ) from pandas.core.dtypes.missing import isna, notna import pandas as pd from pandas._typing import ArrayLike -from pandas.core.construction import extract_array - -from . import missing -from .docstrings import ( +from pandas.core.construction import array, extract_array +from pandas.core.ops.array_ops import comp_method_OBJECT_ARRAY, define_na_arithmetic_op +from pandas.core.ops.docstrings import ( _arith_doc_FRAME, _flex_comp_doc_FRAME, _make_flex_doc, _op_descriptions, ) -from .roperator import ( # noqa:F401 +from pandas.core.ops.invalid import invalid_comparison +from pandas.core.ops.methods import ( # noqa:F401 + add_flex_arithmetic_methods, + add_special_arithmetic_methods, +) +from pandas.core.ops.roperator import ( # noqa:F401 radd, rand_, rdiv, @@ -174,7 +169,7 @@ def maybe_upcast_for_op(obj, shape: Tuple[int, ...]): # np.timedelta64(3, 'D') / 2 == np.timedelta64(1, 'D') return Timedelta(obj) - elif isinstance(obj, np.ndarray) and is_timedelta64_dtype(obj): + elif isinstance(obj, np.ndarray) and is_timedelta64_dtype(obj.dtype): # GH#22390 Unfortunately we need to special-case right-hand # timedelta64 dtypes because numpy casts integer dtypes to # timedelta64 when operating with timedelta64 @@ -185,29 +180,6 @@ def maybe_upcast_for_op(obj, shape: Tuple[int, ...]): # ----------------------------------------------------------------------------- -def make_invalid_op(name): - """ - Return a binary method that always raises a TypeError. - - Parameters - ---------- - name : str - - Returns - ------- - invalid_op : function - """ - - def invalid_op(self, other=None): - raise TypeError( - "cannot perform {name} with this index type: " - "{typ}".format(name=name, typ=type(self).__name__) - ) - - invalid_op.__name__ = name - return invalid_op - - def _gen_eval_kwargs(name): """ Find the keyword arguments to pass to numexpr for the given operation. @@ -419,97 +391,35 @@ def mask_cmp_op(x, y, op): return result -def masked_arith_op(x, y, op): - """ - If the given arithmetic operation fails, attempt it again on - only the non-null elements of the input array(s). - - Parameters - ---------- - x : np.ndarray - y : np.ndarray, Series, Index - op : binary operator - """ - # For Series `x` is 1D so ravel() is a no-op; calling it anyway makes - # the logic valid for both Series and DataFrame ops. 
- xrav = x.ravel() - assert isinstance(x, np.ndarray), type(x) - if isinstance(y, np.ndarray): - dtype = find_common_type([x.dtype, y.dtype]) - result = np.empty(x.size, dtype=dtype) - - # PeriodIndex.ravel() returns int64 dtype, so we have - # to work around that case. See GH#19956 - yrav = y if is_period_dtype(y) else y.ravel() - mask = notna(xrav) & notna(yrav) - - if yrav.shape != mask.shape: - # FIXME: GH#5284, GH#5035, GH#19448 - # Without specifically raising here we get mismatched - # errors in Py3 (TypeError) vs Py2 (ValueError) - # Note: Only = an issue in DataFrame case - raise ValueError("Cannot broadcast operands together.") - - if mask.any(): - with np.errstate(all="ignore"): - result[mask] = op(xrav[mask], yrav[mask]) - - else: - assert is_scalar(y), type(y) - assert isinstance(x, np.ndarray), type(x) - # mask is only meaningful for x - result = np.empty(x.size, dtype=x.dtype) - mask = notna(xrav) - - # 1 ** np.nan is 1. So we have to unmask those. - if op == pow: - mask = np.where(x == 1, False, mask) - elif op == rpow: - mask = np.where(y == 1, False, mask) - - if mask.any(): - with np.errstate(all="ignore"): - result[mask] = op(xrav[mask], y) - - result, changed = maybe_upcast_putmask(result, ~mask, np.nan) - result = result.reshape(x.shape) # 2D compat - return result +# ----------------------------------------------------------------------------- +# Dispatch logic -def invalid_comparison(left, right, op): +def should_extension_dispatch(left: ABCSeries, right: Any) -> bool: """ - If a comparison has mismatched types and is not necessarily meaningful, - follow python3 conventions by: - - - returning all-False for equality - - returning all-True for inequality - - raising TypeError otherwise + Identify cases where Series operation should use dispatch_to_extension_op. Parameters ---------- - left : array-like - right : scalar, array-like - op : operator.{eq, ne, lt, le, gt} + left : Series + right : object - Raises - ------ - TypeError : on inequality comparisons + Returns + ------- + bool """ - if op is operator.eq: - res_values = np.zeros(left.shape, dtype=bool) - elif op is operator.ne: - res_values = np.ones(left.shape, dtype=bool) - else: - raise TypeError( - "Invalid comparison between dtype={dtype} and {typ}".format( - dtype=left.dtype, typ=type(right).__name__ - ) - ) - return res_values + if ( + is_extension_array_dtype(left.dtype) + or is_datetime64_dtype(left.dtype) + or is_timedelta64_dtype(left.dtype) + ): + return True + if not is_scalar(right) and is_extension_array_dtype(right): + # GH#22378 disallow scalar to exclude e.g. "category", "Int64" + return True -# ----------------------------------------------------------------------------- -# Dispatch logic + return False def should_series_dispatch(left, right, op): @@ -616,19 +526,18 @@ def dispatch_to_extension_op(op, left, right): apply the operator defined by op. """ + if left.dtype.kind in "mM": + # We need to cast datetime64 and timedelta64 ndarrays to + # DatetimeArray/TimedeltaArray. But we avoid wrapping others in + # PandasArray as that behaves poorly with e.g. IntegerArray. 
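A minimal sketch of the casting that the comment above describes, using the public pd.array constructor with made-up values:

import numpy as np
import pandas as pd

# pd.array wraps a datetime64 ndarray in a DatetimeArray, which implements
# the arithmetic the extension dispatch relies on.
values = np.array(["2019-01-01", "2019-01-02"], dtype="datetime64[ns]")
wrapped = pd.array(values)
print(type(wrapped).__name__)          # DatetimeArray
print(wrapped + pd.Timedelta(days=1))  # shifted by one day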
+ left = array(left) + # The op calls will raise TypeError if the op is not defined # on the ExtensionArray # unbox Series and Index to arrays - if isinstance(left, (ABCSeries, ABCIndexClass)): - new_left = left._values - else: - new_left = left - - if isinstance(right, (ABCSeries, ABCIndexClass)): - new_right = right._values - else: - new_right = right + new_left = extract_array(left, extract_numpy=True) + new_right = extract_array(right, extract_numpy=True) try: res_values = op(new_left, new_right) @@ -642,224 +551,6 @@ def dispatch_to_extension_op(op, left, right): return res_values -# ----------------------------------------------------------------------------- -# Functions that add arithmetic methods to objects, given arithmetic factory -# methods - - -def _get_method_wrappers(cls): - """ - Find the appropriate operation-wrappers to use when defining flex/special - arithmetic, boolean, and comparison operations with the given class. - - Parameters - ---------- - cls : class - - Returns - ------- - arith_flex : function or None - comp_flex : function or None - arith_special : function - comp_special : function - bool_special : function - - Notes - ----- - None is only returned for SparseArray - """ - if issubclass(cls, ABCSparseSeries): - # Be sure to catch this before ABCSeries and ABCSparseArray, - # as they will both come see SparseSeries as a subclass - arith_flex = _flex_method_SERIES - comp_flex = _flex_method_SERIES - arith_special = _arith_method_SPARSE_SERIES - comp_special = _arith_method_SPARSE_SERIES - bool_special = _bool_method_SERIES - # TODO: I don't think the functions defined by bool_method are tested - elif issubclass(cls, ABCSeries): - # Just Series; SparseSeries is caught above - arith_flex = _flex_method_SERIES - comp_flex = _flex_method_SERIES - arith_special = _arith_method_SERIES - comp_special = _comp_method_SERIES - bool_special = _bool_method_SERIES - elif issubclass(cls, ABCDataFrame): - # Same for DataFrame and SparseDataFrame - arith_flex = _arith_method_FRAME - comp_flex = _flex_comp_method_FRAME - arith_special = _arith_method_FRAME - comp_special = _comp_method_FRAME - bool_special = _arith_method_FRAME - return arith_flex, comp_flex, arith_special, comp_special, bool_special - - -def _create_methods(cls, arith_method, comp_method, bool_method, special): - # creates actual methods based upon arithmetic, comp and bool method - # constructors. 
- - have_divmod = issubclass(cls, ABCSeries) - # divmod is available for Series and SparseSeries - - # yapf: disable - new_methods = dict( - add=arith_method(cls, operator.add, special), - radd=arith_method(cls, radd, special), - sub=arith_method(cls, operator.sub, special), - mul=arith_method(cls, operator.mul, special), - truediv=arith_method(cls, operator.truediv, special), - floordiv=arith_method(cls, operator.floordiv, special), - # Causes a floating point exception in the tests when numexpr enabled, - # so for now no speedup - mod=arith_method(cls, operator.mod, special), - pow=arith_method(cls, operator.pow, special), - # not entirely sure why this is necessary, but previously was included - # so it's here to maintain compatibility - rmul=arith_method(cls, rmul, special), - rsub=arith_method(cls, rsub, special), - rtruediv=arith_method(cls, rtruediv, special), - rfloordiv=arith_method(cls, rfloordiv, special), - rpow=arith_method(cls, rpow, special), - rmod=arith_method(cls, rmod, special)) - # yapf: enable - new_methods["div"] = new_methods["truediv"] - new_methods["rdiv"] = new_methods["rtruediv"] - if have_divmod: - # divmod doesn't have an op that is supported by numexpr - new_methods["divmod"] = arith_method(cls, divmod, special) - new_methods["rdivmod"] = arith_method(cls, rdivmod, special) - - new_methods.update( - dict( - eq=comp_method(cls, operator.eq, special), - ne=comp_method(cls, operator.ne, special), - lt=comp_method(cls, operator.lt, special), - gt=comp_method(cls, operator.gt, special), - le=comp_method(cls, operator.le, special), - ge=comp_method(cls, operator.ge, special), - ) - ) - - if bool_method: - new_methods.update( - dict( - and_=bool_method(cls, operator.and_, special), - or_=bool_method(cls, operator.or_, special), - # For some reason ``^`` wasn't used in original. - xor=bool_method(cls, operator.xor, special), - rand_=bool_method(cls, rand_, special), - ror_=bool_method(cls, ror_, special), - rxor=bool_method(cls, rxor, special), - ) - ) - - if special: - dunderize = lambda x: "__{name}__".format(name=x.strip("_")) - else: - dunderize = lambda x: x - new_methods = {dunderize(k): v for k, v in new_methods.items()} - return new_methods - - -def add_methods(cls, new_methods): - for name, method in new_methods.items(): - # For most methods, if we find that the class already has a method - # of the same name, it is OK to over-write it. The exception is - # inplace methods (__iadd__, __isub__, ...) for SparseArray, which - # retain the np.ndarray versions. - force = not (issubclass(cls, ABCSparseArray) and name.startswith("__i")) - if force or name not in cls.__dict__: - setattr(cls, name, method) - - -# ---------------------------------------------------------------------- -# Arithmetic -def add_special_arithmetic_methods(cls): - """ - Adds the full suite of special arithmetic methods (``__add__``, - ``__sub__``, etc.) to the class. 
- - Parameters - ---------- - cls : class - special methods will be defined and pinned to this class - """ - _, _, arith_method, comp_method, bool_method = _get_method_wrappers(cls) - new_methods = _create_methods( - cls, arith_method, comp_method, bool_method, special=True - ) - # inplace operators (I feel like these should get passed an `inplace=True` - # or just be removed - - def _wrap_inplace_method(method): - """ - return an inplace wrapper for this method - """ - - def f(self, other): - result = method(self, other) - - # this makes sure that we are aligned like the input - # we are updating inplace so we want to ignore is_copy - self._update_inplace( - result.reindex_like(self, copy=False)._data, verify_is_copy=False - ) - - return self - - f.__name__ = "__i{name}__".format(name=method.__name__.strip("__")) - return f - - new_methods.update( - dict( - __iadd__=_wrap_inplace_method(new_methods["__add__"]), - __isub__=_wrap_inplace_method(new_methods["__sub__"]), - __imul__=_wrap_inplace_method(new_methods["__mul__"]), - __itruediv__=_wrap_inplace_method(new_methods["__truediv__"]), - __ifloordiv__=_wrap_inplace_method(new_methods["__floordiv__"]), - __imod__=_wrap_inplace_method(new_methods["__mod__"]), - __ipow__=_wrap_inplace_method(new_methods["__pow__"]), - ) - ) - - new_methods.update( - dict( - __iand__=_wrap_inplace_method(new_methods["__and__"]), - __ior__=_wrap_inplace_method(new_methods["__or__"]), - __ixor__=_wrap_inplace_method(new_methods["__xor__"]), - ) - ) - - add_methods(cls, new_methods=new_methods) - - -def add_flex_arithmetic_methods(cls): - """ - Adds the full suite of flex arithmetic methods (``pow``, ``mul``, ``add``) - to the class. - - Parameters - ---------- - cls : class - flex methods will be defined and pinned to this class - """ - flex_arith_method, flex_comp_method, _, _, _ = _get_method_wrappers(cls) - new_methods = _create_methods( - cls, flex_arith_method, flex_comp_method, bool_method=None, special=False - ) - new_methods.update( - dict( - multiply=new_methods["mul"], - subtract=new_methods["sub"], - divide=new_methods["div"], - ) - ) - # opt out of bool flex methods for now - assert not any(kname in new_methods for kname in ("ror_", "rxor", "rand_")) - - add_methods(cls, new_methods=new_methods) - - # ----------------------------------------------------------------------------- # Series @@ -918,33 +609,7 @@ def _arith_method_SERIES(cls, op, special): _construct_divmod_result if op in [divmod, rdivmod] else _construct_result ) - def na_op(x, y): - """ - Return the result of evaluating op on the passed in values. - - If native types are not compatible, try coersion to object dtype. 
- - Parameters - ---------- - x : array-like - y : array-like or scalar - - Returns - ------- - array-like - - Raises - ------ - TypeError : invalid operation - """ - import pandas.core.computation.expressions as expressions - - try: - result = expressions.evaluate(op, str_rep, x, y, **eval_kwargs) - except TypeError: - result = masked_arith_op(x, y, op) - - return missing.dispatch_fill_zeros(op, x, y, result) + na_op = define_na_arithmetic_op(op, str_rep, eval_kwargs) def wrapper(left, right): if isinstance(right, ABCDataFrame): @@ -954,127 +619,56 @@ def wrapper(left, right): res_name = get_op_result_name(left, right) right = maybe_upcast_for_op(right, left.shape) - if is_categorical_dtype(left): - raise TypeError( - "{typ} cannot perform the operation " - "{op}".format(typ=type(left).__name__, op=str_rep) - ) - - elif is_datetime64_dtype(left) or is_datetime64tz_dtype(left): - from pandas.core.arrays import DatetimeArray - - result = dispatch_to_extension_op(op, DatetimeArray(left), right) - return construct_result(left, result, index=left.index, name=res_name) - - elif is_extension_array_dtype(left) or ( - is_extension_array_dtype(right) and not is_scalar(right) - ): - # GH#22378 disallow scalar to exclude e.g. "category", "Int64" + if should_extension_dispatch(left, right): result = dispatch_to_extension_op(op, left, right) - return construct_result(left, result, index=left.index, name=res_name) - elif is_timedelta64_dtype(left): - from pandas.core.arrays import TimedeltaArray - - result = dispatch_to_extension_op(op, TimedeltaArray(left), right) - return construct_result(left, result, index=left.index, name=res_name) - - elif is_timedelta64_dtype(right): - # We should only get here with non-scalar values for right - # upcast by maybe_upcast_for_op + elif is_timedelta64_dtype(right) or isinstance( + right, (ABCDatetimeArray, ABCDatetimeIndex) + ): + # We should only get here with td64 right with non-scalar values + # for right upcast by maybe_upcast_for_op assert not isinstance(right, (np.timedelta64, np.ndarray)) - result = op(left._values, right) - # We do not pass dtype to ensure that the Series constructor - # does inference in the case where `result` has object-dtype. - return construct_result(left, result, index=left.index, name=res_name) - - elif isinstance(right, (ABCDatetimeArray, ABCDatetimeIndex)): - result = op(left._values, right) - return construct_result(left, result, index=left.index, name=res_name) + else: + lvalues = extract_array(left, extract_numpy=True) + rvalues = extract_array(right, extract_numpy=True) - lvalues = left.values - rvalues = right - if isinstance(rvalues, (ABCSeries, ABCIndexClass)): - rvalues = rvalues._values + with np.errstate(all="ignore"): + result = na_op(lvalues, rvalues) - with np.errstate(all="ignore"): - result = na_op(lvalues, rvalues) - return construct_result( - left, result, index=left.index, name=res_name, dtype=None - ) + # We do not pass dtype to ensure that the Series constructor + # does inference in the case where `result` has object-dtype. 
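A short sketch of the inference the comment above refers to, with illustrative values:

import numpy as np
import pandas as pd

# An object-dtype result is left to the Series constructor, which can
# infer a richer dtype than the op itself produced.
obj_result = np.array(
    [pd.Timestamp("2019-01-01"), pd.Timestamp("2019-01-02")], dtype=object
)
print(pd.Series(obj_result).dtype)   # datetime64[ns]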
+ return construct_result(left, result, index=left.index, name=res_name) wrapper.__name__ = op_name return wrapper -def _comp_method_OBJECT_ARRAY(op, x, y): - if isinstance(y, list): - y = construct_1d_object_array_from_listlike(y) - if isinstance(y, (np.ndarray, ABCSeries, ABCIndex)): - if not is_object_dtype(y.dtype): - y = y.astype(np.object_) - - if isinstance(y, (ABCSeries, ABCIndex)): - y = y.values - - result = libops.vec_compare(x, y, op) - else: - result = libops.scalar_compare(x, y, op) - return result - - def _comp_method_SERIES(cls, op, special): """ Wrapper function for Series arithmetic operations, to avoid code duplication. """ op_name = _get_op_name(op, special) - masker = _gen_eval_kwargs(op_name).get("masker", False) def na_op(x, y): # TODO: - # should have guarantess on what x, y can be type-wise + # should have guarantees on what x, y can be type-wise # Extension Dtypes are not called here - # Checking that cases that were once handled here are no longer - # reachable. - assert not (is_categorical_dtype(y) and not is_scalar(y)) - if is_object_dtype(x.dtype): - result = _comp_method_OBJECT_ARRAY(op, x, y) + result = comp_method_OBJECT_ARRAY(op, x, y) elif is_datetimelike_v_numeric(x, y): return invalid_comparison(x, y, op) else: - - # we want to compare like types - # we only want to convert to integer like if - # we are not NotImplemented, otherwise - # we would allow datetime64 (but viewed as i8) against - # integer comparisons - - # we have a datetime/timedelta and may need to convert - assert not needs_i8_conversion(x) - mask = None - if not is_scalar(y) and needs_i8_conversion(y): - mask = isna(x) | isna(y) - y = y.view("i8") - x = x.view("i8") - - method = getattr(x, op_name, None) - if method is not None: - with np.errstate(all="ignore"): - result = method(y) - if result is NotImplemented: - return invalid_comparison(x, y, op) - else: - result = op(x, y) - - if mask is not None and mask.any(): - result[mask] = masker + method = getattr(x, op_name) + with np.errstate(all="ignore"): + result = method(y) + if result is NotImplemented: + return invalid_comparison(x, y, op) return result @@ -1084,6 +678,15 @@ def wrapper(self, other, axis=None): self._get_axis_number(axis) res_name = get_op_result_name(self, other) + other = lib.item_from_zerodim(other) + + # TODO: shouldn't we be applying finalize whenever + # not isinstance(other, ABCSeries)? + finalizer = ( + lambda x: x.__finalize__(self) + if isinstance(other, (np.ndarray, ABCIndexClass)) + else x + ) if isinstance(other, list): # TODO: same for tuples? 
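A tiny public-API sketch of the zero-dim unboxing mentioned above, with made-up data:

import numpy as np
import pandas as pd

# item_from_zerodim unboxes 0-d ndarrays, so this compares as a scalar
# rather than tripping any length check.
s = pd.Series([1, 2, 3])
print((s == np.array(2)).tolist())   # [False, True, False]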
@@ -1093,57 +696,20 @@ def wrapper(self, other, axis=None): # Defer to DataFrame implementation; fail early return NotImplemented - elif isinstance(other, ABCSeries) and not self._indexed_same(other): + if isinstance(other, ABCSeries) and not self._indexed_same(other): raise ValueError("Can only compare identically-labeled Series objects") - elif is_categorical_dtype(self): - # Dispatch to Categorical implementation; CategoricalIndex - # behavior is non-canonical GH#19513 - res_values = dispatch_to_extension_op(op, self, other) - return self._constructor(res_values, index=self.index, name=res_name) - - elif is_datetime64_dtype(self) or is_datetime64tz_dtype(self): - # Dispatch to DatetimeIndex to ensure identical - # Series/Index behavior - from pandas.core.arrays import DatetimeArray - - res_values = dispatch_to_extension_op(op, DatetimeArray(self), other) - return self._constructor(res_values, index=self.index, name=res_name) - - elif is_timedelta64_dtype(self): - from pandas.core.arrays import TimedeltaArray - - res_values = dispatch_to_extension_op(op, TimedeltaArray(self), other) - return self._constructor(res_values, index=self.index, name=res_name) - - elif is_extension_array_dtype(self) or ( - is_extension_array_dtype(other) and not is_scalar(other) + elif isinstance( + other, (np.ndarray, ABCExtensionArray, ABCIndexClass, ABCSeries) ): - # Note: the `not is_scalar(other)` condition rules out - # e.g. other == "category" - res_values = dispatch_to_extension_op(op, self, other) - return self._constructor(res_values, index=self.index).rename(res_name) - - elif isinstance(other, ABCSeries): - # By this point we have checked that self._indexed_same(other) - res_values = na_op(self.values, other.values) - # rename is needed in case res_name is None and res_values.name - # is not. - return self._constructor( - res_values, index=self.index, name=res_name - ).rename(res_name) - - elif isinstance(other, (np.ndarray, ABCIndexClass)): - # do not check length of zerodim array - # as it will broadcast - if other.ndim != 0 and len(self) != len(other): + # TODO: make this treatment consistent across ops and classes. + # We are not catching all listlikes here (e.g. frozenset, tuple) + # The ambiguous case is object-dtype. See GH#27803 + if len(self) != len(other): raise ValueError("Lengths must match to compare") - res_values = na_op(self.values, np.asarray(other)) - result = self._constructor(res_values, index=self.index) - # rename is needed in case res_name is None and self.name - # is not. 
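The length guard above, seen from the public API; values are illustrative:

import pandas as pd

s = pd.Series([1, 2, 3])
try:
    s == [1, 2]
except ValueError as err:
    print(err)                       # Lengths must match to compare
print((s == [1, 2, 3]).tolist())     # [True, True, True]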
- return result.__finalize__(self).rename(res_name) + if should_extension_dispatch(self, other): + res_values = dispatch_to_extension_op(op, self, other) elif is_scalar(other) and isna(other): # numpy does not like comparisons vs None @@ -1151,25 +717,25 @@ def wrapper(self, other, axis=None): res_values = np.ones(len(self), dtype=bool) else: res_values = np.zeros(len(self), dtype=bool) - return self._constructor( - res_values, index=self.index, name=res_name, dtype="bool" - ) else: - values = self.to_numpy() + lvalues = extract_array(self, extract_numpy=True) + rvalues = extract_array(other, extract_numpy=True) with np.errstate(all="ignore"): - res = na_op(values, other) - if is_scalar(res): + res_values = na_op(lvalues, rvalues) + if is_scalar(res_values): raise TypeError( "Could not compare {typ} type with Series".format(typ=type(other)) ) - # always return a full value series here - res_values = extract_array(res, extract_numpy=True) - return self._constructor( - res_values, index=self.index, name=res_name, dtype="bool" - ) + result = self._constructor(res_values, index=self.index) + result = finalizer(result) + + # Set the result's name after finalizer is called because finalizer + # would set it back to self.name + result.name = res_name + return result wrapper.__name__ = op_name return wrapper @@ -1189,7 +755,7 @@ def na_op(x, y): assert not isinstance(y, (list, ABCSeries, ABCIndexClass)) if isinstance(y, np.ndarray): # bool-bool dtype operations should be OK, should not get here - assert not (is_bool_dtype(x) and is_bool_dtype(y)) + assert not (is_bool_dtype(x.dtype) and is_bool_dtype(y.dtype)) x = ensure_object(x) y = ensure_object(y) result = libops.vec_binop(x, y, op) @@ -1238,7 +804,7 @@ def wrapper(self, other): else: # scalars, list, tuple, np.array - is_other_int_dtype = is_integer_dtype(np.asarray(other)) + is_other_int_dtype = is_integer_dtype(np.asarray(other).dtype) if is_list_like(other) and not isinstance(other, np.ndarray): # TODO: Can we do this before the is_integer_dtype check? # could the is_integer_dtype check be checking the wrong @@ -1397,15 +963,7 @@ def _arith_method_FRAME(cls, op, special): eval_kwargs = _gen_eval_kwargs(op_name) default_axis = _get_frame_op_default_axis(op_name) - def na_op(x, y): - import pandas.core.computation.expressions as expressions - - try: - result = expressions.evaluate(op, str_rep, x, y, **eval_kwargs) - except TypeError: - result = masked_arith_op(x, y, op) - - return missing.dispatch_fill_zeros(op, x, y, result) + na_op = define_na_arithmetic_op(op, str_rep, eval_kwargs) if op_name in _op_descriptions: # i.e. 
include "add" but not "__add__" @@ -1430,10 +988,10 @@ def f(self, other, axis=default_axis, level=None, fill_value=None): self, other, pass_op, fill_value=fill_value, axis=axis, level=level ) else: + # in this case we always have `np.ndim(other) == 0` if fill_value is not None: self = self.fillna(fill_value) - assert np.ndim(other) == 0 return self._combine_const(other, op) f.__name__ = op_name @@ -1474,7 +1032,7 @@ def f(self, other, axis=default_axis, level=None): self, other, na_op, fill_value=None, axis=axis, level=level ) else: - assert np.ndim(other) == 0, other + # in this case we always have `np.ndim(other) == 0` return self._combine_const(other, na_op) f.__name__ = op_name diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py new file mode 100644 index 0000000000000..f5f6d77676f1f --- /dev/null +++ b/pandas/core/ops/array_ops.py @@ -0,0 +1,128 @@ +""" +Functions for arithmetic and comparison operations on NumPy arrays and +ExtensionArrays. +""" +import numpy as np + +from pandas._libs import ops as libops + +from pandas.core.dtypes.cast import ( + construct_1d_object_array_from_listlike, + find_common_type, + maybe_upcast_putmask, +) +from pandas.core.dtypes.common import is_object_dtype, is_scalar +from pandas.core.dtypes.generic import ABCIndex, ABCSeries +from pandas.core.dtypes.missing import notna + +from pandas.core.ops import missing +from pandas.core.ops.roperator import rpow + + +def comp_method_OBJECT_ARRAY(op, x, y): + if isinstance(y, list): + y = construct_1d_object_array_from_listlike(y) + + # TODO: Should the checks below be ABCIndexClass? + if isinstance(y, (np.ndarray, ABCSeries, ABCIndex)): + # TODO: should this be ABCIndexClass?? + if not is_object_dtype(y.dtype): + y = y.astype(np.object_) + + if isinstance(y, (ABCSeries, ABCIndex)): + y = y.values + + result = libops.vec_compare(x, y, op) + else: + result = libops.scalar_compare(x, y, op) + return result + + +def masked_arith_op(x, y, op): + """ + If the given arithmetic operation fails, attempt it again on + only the non-null elements of the input array(s). + + Parameters + ---------- + x : np.ndarray + y : np.ndarray, Series, Index + op : binary operator + """ + # For Series `x` is 1D so ravel() is a no-op; calling it anyway makes + # the logic valid for both Series and DataFrame ops. + xrav = x.ravel() + assert isinstance(x, np.ndarray), type(x) + if isinstance(y, np.ndarray): + dtype = find_common_type([x.dtype, y.dtype]) + result = np.empty(x.size, dtype=dtype) + + # NB: ravel() is only safe since y is ndarray; for e.g. PeriodIndex + # we would get int64 dtype, see GH#19956 + yrav = y.ravel() + mask = notna(xrav) & notna(yrav) + + if yrav.shape != mask.shape: + # FIXME: GH#5284, GH#5035, GH#19448 + # Without specifically raising here we get mismatched + # errors in Py3 (TypeError) vs Py2 (ValueError) + # Note: Only = an issue in DataFrame case + raise ValueError("Cannot broadcast operands together.") + + if mask.any(): + with np.errstate(all="ignore"): + result[mask] = op(xrav[mask], yrav[mask]) + + else: + if not is_scalar(y): + raise TypeError(type(y)) + + # mask is only meaningful for x + result = np.empty(x.size, dtype=x.dtype) + mask = notna(xrav) + + # 1 ** np.nan is 1. So we have to unmask those. 
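A quick numeric check of the power corner case that the mask adjustment above handles:

import numpy as np

print(1 ** np.nan)   # 1.0
print(np.nan ** 0)   # 1.0
print(2 ** np.nan)   # nan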
+ if op is pow: + mask = np.where(x == 1, False, mask) + elif op is rpow: + mask = np.where(y == 1, False, mask) + + if mask.any(): + with np.errstate(all="ignore"): + result[mask] = op(xrav[mask], y) + + result, changed = maybe_upcast_putmask(result, ~mask, np.nan) + result = result.reshape(x.shape) # 2D compat + return result + + +def define_na_arithmetic_op(op, str_rep, eval_kwargs): + def na_op(x, y): + """ + Return the result of evaluating op on the passed in values. + + If native types are not compatible, try coersion to object dtype. + + Parameters + ---------- + x : array-like + y : array-like or scalar + + Returns + ------- + array-like + + Raises + ------ + TypeError : invalid operation + """ + import pandas.core.computation.expressions as expressions + + try: + result = expressions.evaluate(op, str_rep, x, y, **eval_kwargs) + except TypeError: + result = masked_arith_op(x, y, op) + + return missing.dispatch_fill_zeros(op, x, y, result) + + return na_op diff --git a/pandas/core/ops/invalid.py b/pandas/core/ops/invalid.py new file mode 100644 index 0000000000000..013ff7689b221 --- /dev/null +++ b/pandas/core/ops/invalid.py @@ -0,0 +1,61 @@ +""" +Templates for invalid operations. +""" +import operator + +import numpy as np + + +def invalid_comparison(left, right, op): + """ + If a comparison has mismatched types and is not necessarily meaningful, + follow python3 conventions by: + + - returning all-False for equality + - returning all-True for inequality + - raising TypeError otherwise + + Parameters + ---------- + left : array-like + right : scalar, array-like + op : operator.{eq, ne, lt, le, gt} + + Raises + ------ + TypeError : on inequality comparisons + """ + if op is operator.eq: + res_values = np.zeros(left.shape, dtype=bool) + elif op is operator.ne: + res_values = np.ones(left.shape, dtype=bool) + else: + raise TypeError( + "Invalid comparison between dtype={dtype} and {typ}".format( + dtype=left.dtype, typ=type(right).__name__ + ) + ) + return res_values + + +def make_invalid_op(name: str): + """ + Return a binary method that always raises a TypeError. + + Parameters + ---------- + name : str + + Returns + ------- + invalid_op : function + """ + + def invalid_op(self, other=None): + raise TypeError( + "cannot perform {name} with this index type: " + "{typ}".format(name=name, typ=type(self).__name__) + ) + + invalid_op.__name__ = name + return invalid_op diff --git a/pandas/core/ops/methods.py b/pandas/core/ops/methods.py new file mode 100644 index 0000000000000..eba0a797a791f --- /dev/null +++ b/pandas/core/ops/methods.py @@ -0,0 +1,249 @@ +""" +Functions to generate methods and pin them to the appropriate classes. +""" +import operator + +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCSeries, + ABCSparseArray, + ABCSparseSeries, +) + +from pandas.core.ops.roperator import ( + radd, + rand_, + rdivmod, + rfloordiv, + rmod, + rmul, + ror_, + rpow, + rsub, + rtruediv, + rxor, +) + + +def _get_method_wrappers(cls): + """ + Find the appropriate operation-wrappers to use when defining flex/special + arithmetic, boolean, and comparison operations with the given class. 
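A minimal sketch of the python-3-style conventions that invalid_comparison above implements, exercised through the public Series API on made-up data (behavior as of the pandas 0.25 line):

import pandas as pd

s = pd.Series(pd.date_range("2019-01-01", periods=2))
print((s == 1).tolist())   # [False, False]
print((s != 1).tolist())   # [True, True]
try:
    s < 1
except TypeError as err:
    print(type(err).__name__)   # TypeError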
+ + Parameters + ---------- + cls : class + + Returns + ------- + arith_flex : function or None + comp_flex : function or None + arith_special : function + comp_special : function + bool_special : function + + Notes + ----- + None is only returned for SparseArray + """ + # TODO: make these non-runtime imports once the relevant functions + # are no longer in __init__ + from pandas.core.ops import ( + _arith_method_FRAME, + _arith_method_SERIES, + _arith_method_SPARSE_SERIES, + _bool_method_SERIES, + _comp_method_FRAME, + _comp_method_SERIES, + _flex_comp_method_FRAME, + _flex_method_SERIES, + ) + + if issubclass(cls, ABCSparseSeries): + # Be sure to catch this before ABCSeries and ABCSparseArray, + # as they will both come see SparseSeries as a subclass + arith_flex = _flex_method_SERIES + comp_flex = _flex_method_SERIES + arith_special = _arith_method_SPARSE_SERIES + comp_special = _arith_method_SPARSE_SERIES + bool_special = _bool_method_SERIES + # TODO: I don't think the functions defined by bool_method are tested + elif issubclass(cls, ABCSeries): + # Just Series; SparseSeries is caught above + arith_flex = _flex_method_SERIES + comp_flex = _flex_method_SERIES + arith_special = _arith_method_SERIES + comp_special = _comp_method_SERIES + bool_special = _bool_method_SERIES + elif issubclass(cls, ABCDataFrame): + # Same for DataFrame and SparseDataFrame + arith_flex = _arith_method_FRAME + comp_flex = _flex_comp_method_FRAME + arith_special = _arith_method_FRAME + comp_special = _comp_method_FRAME + bool_special = _arith_method_FRAME + return arith_flex, comp_flex, arith_special, comp_special, bool_special + + +def add_special_arithmetic_methods(cls): + """ + Adds the full suite of special arithmetic methods (``__add__``, + ``__sub__``, etc.) to the class. + + Parameters + ---------- + cls : class + special methods will be defined and pinned to this class + """ + _, _, arith_method, comp_method, bool_method = _get_method_wrappers(cls) + new_methods = _create_methods( + cls, arith_method, comp_method, bool_method, special=True + ) + # inplace operators (I feel like these should get passed an `inplace=True` + # or just be removed + + def _wrap_inplace_method(method): + """ + return an inplace wrapper for this method + """ + + def f(self, other): + result = method(self, other) + + # this makes sure that we are aligned like the input + # we are updating inplace so we want to ignore is_copy + self._update_inplace( + result.reindex_like(self, copy=False)._data, verify_is_copy=False + ) + + return self + + f.__name__ = "__i{name}__".format(name=method.__name__.strip("__")) + return f + + new_methods.update( + dict( + __iadd__=_wrap_inplace_method(new_methods["__add__"]), + __isub__=_wrap_inplace_method(new_methods["__sub__"]), + __imul__=_wrap_inplace_method(new_methods["__mul__"]), + __itruediv__=_wrap_inplace_method(new_methods["__truediv__"]), + __ifloordiv__=_wrap_inplace_method(new_methods["__floordiv__"]), + __imod__=_wrap_inplace_method(new_methods["__mod__"]), + __ipow__=_wrap_inplace_method(new_methods["__pow__"]), + ) + ) + + new_methods.update( + dict( + __iand__=_wrap_inplace_method(new_methods["__and__"]), + __ior__=_wrap_inplace_method(new_methods["__or__"]), + __ixor__=_wrap_inplace_method(new_methods["__xor__"]), + ) + ) + + _add_methods(cls, new_methods=new_methods) + + +def add_flex_arithmetic_methods(cls): + """ + Adds the full suite of flex arithmetic methods (``pow``, ``mul``, ``add``) + to the class. 
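A short sketch of what the generated flex methods add over the dunder equivalents, with illustrative Series:

import pandas as pd

# Flex methods accept a fill_value; the special methods do not.
a = pd.Series([1, 2], index=["x", "y"])
b = pd.Series([10], index=["x"])
print((a + b).to_dict())                  # {'x': 11.0, 'y': nan}
print(a.add(b, fill_value=0).to_dict())   # {'x': 11.0, 'y': 2.0}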
+ + Parameters + ---------- + cls : class + flex methods will be defined and pinned to this class + """ + flex_arith_method, flex_comp_method, _, _, _ = _get_method_wrappers(cls) + new_methods = _create_methods( + cls, flex_arith_method, flex_comp_method, bool_method=None, special=False + ) + new_methods.update( + dict( + multiply=new_methods["mul"], + subtract=new_methods["sub"], + divide=new_methods["div"], + ) + ) + # opt out of bool flex methods for now + assert not any(kname in new_methods for kname in ("ror_", "rxor", "rand_")) + + _add_methods(cls, new_methods=new_methods) + + +def _create_methods(cls, arith_method, comp_method, bool_method, special): + # creates actual methods based upon arithmetic, comp and bool method + # constructors. + + have_divmod = issubclass(cls, ABCSeries) + # divmod is available for Series and SparseSeries + + # yapf: disable + new_methods = dict( + add=arith_method(cls, operator.add, special), + radd=arith_method(cls, radd, special), + sub=arith_method(cls, operator.sub, special), + mul=arith_method(cls, operator.mul, special), + truediv=arith_method(cls, operator.truediv, special), + floordiv=arith_method(cls, operator.floordiv, special), + # Causes a floating point exception in the tests when numexpr enabled, + # so for now no speedup + mod=arith_method(cls, operator.mod, special), + pow=arith_method(cls, operator.pow, special), + # not entirely sure why this is necessary, but previously was included + # so it's here to maintain compatibility + rmul=arith_method(cls, rmul, special), + rsub=arith_method(cls, rsub, special), + rtruediv=arith_method(cls, rtruediv, special), + rfloordiv=arith_method(cls, rfloordiv, special), + rpow=arith_method(cls, rpow, special), + rmod=arith_method(cls, rmod, special)) + # yapf: enable + new_methods["div"] = new_methods["truediv"] + new_methods["rdiv"] = new_methods["rtruediv"] + if have_divmod: + # divmod doesn't have an op that is supported by numexpr + new_methods["divmod"] = arith_method(cls, divmod, special) + new_methods["rdivmod"] = arith_method(cls, rdivmod, special) + + new_methods.update( + dict( + eq=comp_method(cls, operator.eq, special), + ne=comp_method(cls, operator.ne, special), + lt=comp_method(cls, operator.lt, special), + gt=comp_method(cls, operator.gt, special), + le=comp_method(cls, operator.le, special), + ge=comp_method(cls, operator.ge, special), + ) + ) + + if bool_method: + new_methods.update( + dict( + and_=bool_method(cls, operator.and_, special), + or_=bool_method(cls, operator.or_, special), + # For some reason ``^`` wasn't used in original. + xor=bool_method(cls, operator.xor, special), + rand_=bool_method(cls, rand_, special), + ror_=bool_method(cls, ror_, special), + rxor=bool_method(cls, rxor, special), + ) + ) + + if special: + dunderize = lambda x: "__{name}__".format(name=x.strip("_")) + else: + dunderize = lambda x: x + new_methods = {dunderize(k): v for k, v in new_methods.items()} + return new_methods + + +def _add_methods(cls, new_methods): + for name, method in new_methods.items(): + # For most methods, if we find that the class already has a method + # of the same name, it is OK to over-write it. The exception is + # inplace methods (__iadd__, __isub__, ...) for SparseArray, which + # retain the np.ndarray versions. 
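A minimal sketch of the inplace wrappers generated above, using made-up Series:

import pandas as pd

# __iadd__ aligns like the out-of-place op and then updates in place.
s = pd.Series([1.0, 2.0], index=["a", "b"])
s += pd.Series([10.0], index=["a"])
print(s.to_dict())   # {'a': 11.0, 'b': nan}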
+ force = not (issubclass(cls, ABCSparseArray) and name.startswith("__i")) + if force or name not in cls.__dict__: + setattr(cls, name, method) diff --git a/pandas/core/ops/missing.py b/pandas/core/ops/missing.py index 01bc345a40b83..45fa6a2830af6 100644 --- a/pandas/core/ops/missing.py +++ b/pandas/core/ops/missing.py @@ -40,7 +40,7 @@ def fill_zeros(result, x, y, name, fill): Mask the nan's from x. """ - if fill is None or is_float_dtype(result): + if fill is None or is_float_dtype(result.dtype): return result if name.startswith(("r", "__r")): @@ -55,7 +55,7 @@ def fill_zeros(result, x, y, name, fill): if is_scalar_type: y = np.array(y) - if is_integer_dtype(y): + if is_integer_dtype(y.dtype): if (y == 0).any(): diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 66878c3b1026c..a5d0e2cb3b58f 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1630,15 +1630,14 @@ def _get_period_bins(self, ax): def _take_new_index(obj, indexer, new_index, axis=0): - from pandas.core.api import Series, DataFrame - if isinstance(obj, Series): + if isinstance(obj, ABCSeries): new_values = algos.take_1d(obj.values, indexer) - return Series(new_values, index=new_index, name=obj.name) - elif isinstance(obj, DataFrame): + return obj._constructor(new_values, index=new_index, name=obj.name) + elif isinstance(obj, ABCDataFrame): if axis == 1: raise NotImplementedError("axis 1 is not supported") - return DataFrame( + return obj._constructor( obj._data.reindex_indexer(new_axis=new_index, indexer=indexer, axis=1) ) else: diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index f45c7693bf6ed..d7fbe464cb1e5 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -22,7 +22,6 @@ is_bool, is_bool_dtype, is_categorical_dtype, - is_datetime64_dtype, is_datetime64tz_dtype, is_datetimelike, is_dtype_equal, @@ -179,7 +178,7 @@ def merge_ordered( """ Perform merge with optional filling/interpolation designed for ordered data like time series data. Optionally perform group-wise merge (see - examples) + examples). 
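A brief merge_ordered run, complementing the summary above; the frames are made up:

import pandas as pd

left = pd.DataFrame({"key": [1, 3, 5], "lvalue": ["a", "b", "c"]})
right = pd.DataFrame({"key": [2, 3, 6], "rvalue": [10, 20, 30]})
# Rows are ordered on "key" and gaps forward-filled.
print(pd.merge_ordered(left, right, on="key", fill_method="ffill"))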
Parameters ---------- @@ -1635,7 +1634,7 @@ def _get_merge_keys(self): ) ) - if is_datetime64_dtype(lt) or is_datetime64tz_dtype(lt): + if is_datetimelike(lt): if not isinstance(self.tolerance, Timedelta): raise MergeError(msg) if self.tolerance < Timedelta(0): diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 79716520f6654..d653dd87308cf 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -611,13 +611,21 @@ def _normalize(table, normalize, margins, margins_name="All"): table = table.fillna(0) elif margins is True: - - column_margin = table.loc[:, margins_name].drop(margins_name) - index_margin = table.loc[margins_name, :].drop(margins_name) - table = table.drop(margins_name, axis=1).drop(margins_name) - # to keep index and columns names - table_index_names = table.index.names - table_columns_names = table.columns.names + # keep index and column of pivoted table + table_index = table.index + table_columns = table.columns + + # check if margin name is in (for MI cases) or equal to last + # index/column and save the column and index margin + if (margins_name not in table.iloc[-1, :].name) | ( + margins_name != table.iloc[:, -1].name + ): + raise ValueError("{} not in pivoted DataFrame".format(margins_name)) + column_margin = table.iloc[:-1, -1] + index_margin = table.iloc[-1, :-1] + + # keep the core table + table = table.iloc[:-1, :-1] # Normalize core table = _normalize(table, normalize=normalize, margins=False) @@ -627,11 +635,13 @@ def _normalize(table, normalize, margins, margins_name="All"): column_margin = column_margin / column_margin.sum() table = concat([table, column_margin], axis=1) table = table.fillna(0) + table.columns = table_columns elif normalize == "index": index_margin = index_margin / index_margin.sum() table = table.append(index_margin) table = table.fillna(0) + table.index = table_index elif normalize == "all" or normalize is True: column_margin = column_margin / column_margin.sum() @@ -641,13 +651,12 @@ def _normalize(table, normalize, margins, margins_name="All"): table = table.append(index_margin) table = table.fillna(0) + table.index = table_index + table.columns = table_columns else: raise ValueError("Not a valid normalize argument") - table.index.names = table_index_names - table.columns.names = table_columns_names - else: raise ValueError("Not a valid margins argument") diff --git a/pandas/core/series.py b/pandas/core/series.py index 9e317d365ccb8..6fb39c422de93 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -562,13 +562,6 @@ def _values(self): """ return self._data.internal_values() - def _formatting_values(self): - """ - Return the values that can be formatted (used by SeriesFormatter - and DataFrameFormatter). - """ - return self._data.formatting_values() - def get_values(self): """ Same as values (but handles sparseness conversions); is a view. @@ -682,8 +675,8 @@ def nonzero(self): 3 4 dtype: int64 - >>> s = pd.Series([0, 3, 0, 4], index=['a', 'b', 'c', 'd']) # same return although index of s is different + >>> s = pd.Series([0, 3, 0, 4], index=['a', 'b', 'c', 'd']) >>> s.nonzero() (array([1, 3]),) >>> s.iloc[s.nonzero()[0]] @@ -1688,7 +1681,8 @@ def items(self): See Also -------- - DataFrame.items : Equivalent to Series.items for DataFrame. + DataFrame.items : Iterate over (column name, Series) pairs. + DataFrame.iterrows : Iterate over DataFrame rows as (index, Series) pairs. 
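A tiny usage sketch of items(), the Series analogue of the DataFrame methods referenced above; data is illustrative:

import pandas as pd

s = pd.Series(["bear", "fox"], index=["a", "b"])
for label, value in s.items():
    print(label, value)
# a bear
# b fox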
Examples -------- @@ -3626,7 +3620,7 @@ def explode(self) -> "Series": Series.str.split : Split string values on specified separator. Series.unstack : Unstack, a.k.a. pivot, Series with MultiIndex to produce DataFrame. - DataFrame.melt : Unpivot a DataFrame from wide format to long format + DataFrame.melt : Unpivot a DataFrame from wide format to long format. DataFrame.explode : Explode a DataFrame from list-like columns to long format. @@ -4171,12 +4165,10 @@ def rename(self, index=None, **kwargs): """ kwargs["inplace"] = validate_bool_kwarg(kwargs.get("inplace", False), "inplace") - non_mapping = is_scalar(index) or ( - is_list_like(index) and not is_dict_like(index) - ) - if non_mapping: + if callable(index) or is_dict_like(index): + return super().rename(index=index, **kwargs) + else: return self._set_name(index, inplace=kwargs.get("inplace")) - return super().rename(index=index, **kwargs) @Substitution(**_shared_doc_kwargs) @Appender(generic.NDFrame.reindex.__doc__) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 5db31fe6664ea..e6edad656d430 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -271,7 +271,6 @@ def nargsort(items, kind="quicksort", ascending=True, na_position="last"): class _KeyMapper: - """ Ease my suffering. Map compressed group id -> key tuple """ diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index f5add426297a7..8fe6850c84b8b 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -569,13 +569,13 @@ def _combine_frame(self, other, func, fill_value=None, level=None): ).__finalize__(self) def _combine_match_index(self, other, func, level=None): - new_data = {} if level is not None: raise NotImplementedError("'level' argument is not supported") this, other = self.align(other, join="outer", axis=0, level=level, copy=False) + new_data = {} for col, series in this.items(): new_data[col] = func(series.values, other.values) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 169a3a24c254d..25350119f9df5 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1442,6 +1442,12 @@ def str_slice(arr, start=None, stop=None, step=None): 2 hameleon dtype: object + >>> s.str.slice(start=-1) + 0 a + 1 x + 2 n + dtype: object + >>> s.str.slice(stop=2) 0 ko 1 fo diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 172084e97a959..b07647cf5b5fb 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -334,6 +334,9 @@ def _convert_listlike_datetimes( return DatetimeIndex(arg, tz=tz, name=name) except ValueError: pass + elif tz: + # DatetimeArray, DatetimeIndex + return arg.tz_localize(tz) return arg diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index 73e126cf230a5..bcdbf0855cbb4 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -58,7 +58,7 @@ def hash_pandas_object( obj, index=True, encoding="utf8", hash_key=None, categorize=True ): """ - Return a data hash of the Index/Series/DataFrame + Return a data hash of the Index/Series/DataFrame. 
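A minimal sketch of the hashing entry point documented above, on made-up data:

import pandas as pd

# One deterministic uint64 hash per row.
df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
hashed = pd.util.hash_pandas_object(df, index=True)
print(hashed.dtype)   # uint64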
Parameters ---------- diff --git a/pandas/core/window/__init__.py b/pandas/core/window/__init__.py new file mode 100644 index 0000000000000..dcf58a4c0dd5b --- /dev/null +++ b/pandas/core/window/__init__.py @@ -0,0 +1,3 @@ +from pandas.core.window.ewm import EWM # noqa:F401 +from pandas.core.window.expanding import Expanding, ExpandingGroupby # noqa:F401 +from pandas.core.window.rolling import Rolling, RollingGroupby, Window # noqa:F401 diff --git a/pandas/core/window/common.py b/pandas/core/window/common.py new file mode 100644 index 0000000000000..0f2920b3558c9 --- /dev/null +++ b/pandas/core/window/common.py @@ -0,0 +1,276 @@ +"""Common utility functions for rolling operations""" +from collections import defaultdict +import warnings + +import numpy as np + +from pandas.core.dtypes.common import is_integer +from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries + +import pandas.core.common as com +from pandas.core.generic import _shared_docs +from pandas.core.groupby.base import GroupByMixin +from pandas.core.index import MultiIndex + +_shared_docs = dict(**_shared_docs) +_doc_template = """ + Returns + ------- + Series or DataFrame + Return type is determined by the caller. + + See Also + -------- + Series.%(name)s : Series %(name)s. + DataFrame.%(name)s : DataFrame %(name)s. +""" + + +class _GroupByMixin(GroupByMixin): + """ + Provide the groupby facilities. + """ + + def __init__(self, obj, *args, **kwargs): + parent = kwargs.pop("parent", None) # noqa + groupby = kwargs.pop("groupby", None) + if groupby is None: + groupby, obj = obj, obj.obj + self._groupby = groupby + self._groupby.mutated = True + self._groupby.grouper.mutated = True + super().__init__(obj, *args, **kwargs) + + count = GroupByMixin._dispatch("count") + corr = GroupByMixin._dispatch("corr", other=None, pairwise=None) + cov = GroupByMixin._dispatch("cov", other=None, pairwise=None) + + def _apply( + self, func, name=None, window=None, center=None, check_minp=None, **kwargs + ): + """ + Dispatch to apply; we are stripping all of the _apply kwargs and + performing the original function call on the grouped object. 
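A short public-API sketch of the groupby dispatch described above, which is what powers chained groupby-window calls; the frame is illustrative:

import pandas as pd

df = pd.DataFrame({"g": ["a", "a", "b", "b"], "v": [1.0, 2.0, 3.0, 4.0]})
# Each rolling sum is computed within its group.
print(df.groupby("g")["v"].rolling(2).sum())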
+ """ + + def f(x, name=name, *args): + x = self._shallow_copy(x) + + if isinstance(name, str): + return getattr(x, name)(*args, **kwargs) + + return x.apply(name, *args, **kwargs) + + return self._groupby.apply(f) + + +def _flex_binary_moment(arg1, arg2, f, pairwise=False): + + if not ( + isinstance(arg1, (np.ndarray, ABCSeries, ABCDataFrame)) + and isinstance(arg2, (np.ndarray, ABCSeries, ABCDataFrame)) + ): + raise TypeError( + "arguments to moment function must be of type " + "np.ndarray/Series/DataFrame" + ) + + if isinstance(arg1, (np.ndarray, ABCSeries)) and isinstance( + arg2, (np.ndarray, ABCSeries) + ): + X, Y = _prep_binary(arg1, arg2) + return f(X, Y) + + elif isinstance(arg1, ABCDataFrame): + from pandas import DataFrame + + def dataframe_from_int_dict(data, frame_template): + result = DataFrame(data, index=frame_template.index) + if len(result.columns) > 0: + result.columns = frame_template.columns[result.columns] + return result + + results = {} + if isinstance(arg2, ABCDataFrame): + if pairwise is False: + if arg1 is arg2: + # special case in order to handle duplicate column names + for i, col in enumerate(arg1.columns): + results[i] = f(arg1.iloc[:, i], arg2.iloc[:, i]) + return dataframe_from_int_dict(results, arg1) + else: + if not arg1.columns.is_unique: + raise ValueError("'arg1' columns are not unique") + if not arg2.columns.is_unique: + raise ValueError("'arg2' columns are not unique") + with warnings.catch_warnings(record=True): + warnings.simplefilter("ignore", RuntimeWarning) + X, Y = arg1.align(arg2, join="outer") + X = X + 0 * Y + Y = Y + 0 * X + + with warnings.catch_warnings(record=True): + warnings.simplefilter("ignore", RuntimeWarning) + res_columns = arg1.columns.union(arg2.columns) + for col in res_columns: + if col in X and col in Y: + results[col] = f(X[col], Y[col]) + return DataFrame(results, index=X.index, columns=res_columns) + elif pairwise is True: + results = defaultdict(dict) + for i, k1 in enumerate(arg1.columns): + for j, k2 in enumerate(arg2.columns): + if j < i and arg2 is arg1: + # Symmetric case + results[i][j] = results[j][i] + else: + results[i][j] = f( + *_prep_binary(arg1.iloc[:, i], arg2.iloc[:, j]) + ) + + from pandas import concat + + result_index = arg1.index.union(arg2.index) + if len(result_index): + + # construct result frame + result = concat( + [ + concat( + [results[i][j] for j, c in enumerate(arg2.columns)], + ignore_index=True, + ) + for i, c in enumerate(arg1.columns) + ], + ignore_index=True, + axis=1, + ) + result.columns = arg1.columns + + # set the index and reorder + if arg2.columns.nlevels > 1: + result.index = MultiIndex.from_product( + arg2.columns.levels + [result_index] + ) + result = result.reorder_levels([2, 0, 1]).sort_index() + else: + result.index = MultiIndex.from_product( + [range(len(arg2.columns)), range(len(result_index))] + ) + result = result.swaplevel(1, 0).sort_index() + result.index = MultiIndex.from_product( + [result_index] + [arg2.columns] + ) + else: + + # empty result + result = DataFrame( + index=MultiIndex( + levels=[arg1.index, arg2.columns], codes=[[], []] + ), + columns=arg2.columns, + dtype="float64", + ) + + # reset our index names to arg1 names + # reset our column names to arg2 names + # careful not to mutate the original names + result.columns = result.columns.set_names(arg1.columns.names) + result.index = result.index.set_names( + result_index.names + arg2.columns.names + ) + + return result + + else: + raise ValueError("'pairwise' is not True/False") + else: + results = { + i: 
f(*_prep_binary(arg1.iloc[:, i], arg2)) + for i, col in enumerate(arg1.columns) + } + return dataframe_from_int_dict(results, arg1) + + else: + return _flex_binary_moment(arg2, arg1, f) + + +def _get_center_of_mass(comass, span, halflife, alpha): + valid_count = com.count_not_none(comass, span, halflife, alpha) + if valid_count > 1: + raise ValueError("comass, span, halflife, and alpha are mutually exclusive") + + # Convert to center of mass; domain checks ensure 0 < alpha <= 1 + if comass is not None: + if comass < 0: + raise ValueError("comass must satisfy: comass >= 0") + elif span is not None: + if span < 1: + raise ValueError("span must satisfy: span >= 1") + comass = (span - 1) / 2.0 + elif halflife is not None: + if halflife <= 0: + raise ValueError("halflife must satisfy: halflife > 0") + decay = 1 - np.exp(np.log(0.5) / halflife) + comass = 1 / decay - 1 + elif alpha is not None: + if alpha <= 0 or alpha > 1: + raise ValueError("alpha must satisfy: 0 < alpha <= 1") + comass = (1.0 - alpha) / alpha + else: + raise ValueError("Must pass one of comass, span, halflife, or alpha") + + return float(comass) + + +def _offset(window, center): + if not is_integer(window): + window = len(window) + offset = (window - 1) / 2.0 if center else 0 + try: + return int(offset) + except TypeError: + return offset.astype(int) + + +def _require_min_periods(p): + def _check_func(minp, window): + if minp is None: + return window + else: + return max(p, minp) + + return _check_func + + +def _use_window(minp, window): + if minp is None: + return window + else: + return minp + + +def _zsqrt(x): + with np.errstate(all="ignore"): + result = np.sqrt(x) + mask = x < 0 + + if isinstance(x, ABCDataFrame): + if mask.values.any(): + result[mask] = 0 + else: + if mask.any(): + result[mask] = 0 + + return result + + +def _prep_binary(arg1, arg2): + if not isinstance(arg2, type(arg1)): + raise Exception("Input arrays must be of the same type!") + + # mask out values, this also makes a common index... + X = arg1 + 0 * arg2 + Y = arg2 + 0 * arg1 + + return X, Y diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py new file mode 100644 index 0000000000000..40e6c679ba72d --- /dev/null +++ b/pandas/core/window/ewm.py @@ -0,0 +1,388 @@ +from textwrap import dedent + +import numpy as np + +import pandas._libs.window as libwindow +from pandas.compat.numpy import function as nv +from pandas.util._decorators import Appender, Substitution + +from pandas.core.dtypes.generic import ABCDataFrame + +from pandas.core.base import DataError +from pandas.core.window.common import _doc_template, _get_center_of_mass, _shared_docs +from pandas.core.window.rolling import _flex_binary_moment, _Rolling, _zsqrt + +_bias_template = """ + Parameters + ---------- + bias : bool, default False + Use a standard estimation bias correction. + *args, **kwargs + Arguments and keyword arguments to be passed into func. +""" + +_pairwise_template = """ + Parameters + ---------- + other : Series, DataFrame, or ndarray, optional + If not supplied then will default to self and produce pairwise + output. + pairwise : bool, default None + If False then only matching columns between self and other will be + used and the output will be a DataFrame. + If True then all pairwise combinations will be calculated and the + output will be a MultiIndex DataFrame in the case of DataFrame + inputs. In the case of missing elements, only complete pairwise + observations will be used. + bias : bool, default False + Use a standard estimation bias correction. 
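For reference, the centre-of-mass conversions implemented by _get_center_of_mass above, worked through numerically as a standalone sketch:

import numpy as np

# alpha = 1/(1+com) = 2/(span+1) = 1 - exp(log(0.5)/halflife)
span = 9
print((span - 1) / 2.0)                      # com equivalent to span=9 -> 4.0
halflife = 3
alpha = 1 - np.exp(np.log(0.5) / halflife)
print(1 / alpha - 1)                         # com implied by halflife=3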
+ **kwargs + Keyword arguments to be passed into func. +""" + + +class EWM(_Rolling): + r""" + Provide exponential weighted functions. + + Parameters + ---------- + com : float, optional + Specify decay in terms of center of mass, + :math:`\alpha = 1 / (1 + com),\text{ for } com \geq 0`. + span : float, optional + Specify decay in terms of span, + :math:`\alpha = 2 / (span + 1),\text{ for } span \geq 1`. + halflife : float, optional + Specify decay in terms of half-life, + :math:`\alpha = 1 - exp(log(0.5) / halflife),\text{for} halflife > 0`. + alpha : float, optional + Specify smoothing factor :math:`\alpha` directly, + :math:`0 < \alpha \leq 1`. + min_periods : int, default 0 + Minimum number of observations in window required to have a value + (otherwise result is NA). + adjust : bool, default True + Divide by decaying adjustment factor in beginning periods to account + for imbalance in relative weightings + (viewing EWMA as a moving average). + ignore_na : bool, default False + Ignore missing values when calculating weights; + specify True to reproduce pre-0.15.0 behavior. + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to use. The value 0 identifies the rows, and 1 + identifies the columns. + + Returns + ------- + DataFrame + A Window sub-classed for the particular operation. + + See Also + -------- + rolling : Provides rolling window calculations. + expanding : Provides expanding transformations. + + Notes + ----- + Exactly one of center of mass, span, half-life, and alpha must be provided. + Allowed values and relationship between the parameters are specified in the + parameter descriptions above; see the link at the end of this section for + a detailed explanation. + + When adjust is True (default), weighted averages are calculated using + weights (1-alpha)**(n-1), (1-alpha)**(n-2), ..., 1-alpha, 1. + + When adjust is False, weighted averages are calculated recursively as: + weighted_average[0] = arg[0]; + weighted_average[i] = (1-alpha)*weighted_average[i-1] + alpha*arg[i]. + + When ignore_na is False (default), weights are based on absolute positions. + For example, the weights of x and y used in calculating the final weighted + average of [x, None, y] are (1-alpha)**2 and 1 (if adjust is True), and + (1-alpha)**2 and alpha (if adjust is False). + + When ignore_na is True (reproducing pre-0.15.0 behavior), weights are based + on relative positions. For example, the weights of x and y used in + calculating the final weighted average of [x, None, y] are 1-alpha and 1 + (if adjust is True), and 1-alpha and alpha (if adjust is False). 
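A small numeric cross-check of the adjust=True weighting described in the Notes above, using made-up values and alpha = 0.5:

import numpy as np
import pandas as pd

s = pd.Series([1.0, 2.0, 3.0])
alpha = 0.5
# Weights for the last observation: (1-a)**2, (1-a), 1
w = np.array([(1 - alpha) ** 2, 1 - alpha, 1.0])
print(np.dot(w, s.to_numpy()) / w.sum())                 # 2.428571...
print(s.ewm(alpha=alpha, adjust=True).mean().iloc[-1])   # matches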
+ + More details can be found at + http://pandas.pydata.org/pandas-docs/stable/user_guide/computation.html#exponentially-weighted-windows + + Examples + -------- + + >>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}) + >>> df + B + 0 0.0 + 1 1.0 + 2 2.0 + 3 NaN + 4 4.0 + + >>> df.ewm(com=0.5).mean() + B + 0 0.000000 + 1 0.750000 + 2 1.615385 + 3 1.615385 + 4 3.670213 + """ + _attributes = ["com", "min_periods", "adjust", "ignore_na", "axis"] + + def __init__( + self, + obj, + com=None, + span=None, + halflife=None, + alpha=None, + min_periods=0, + adjust=True, + ignore_na=False, + axis=0, + ): + self.obj = obj + self.com = _get_center_of_mass(com, span, halflife, alpha) + self.min_periods = min_periods + self.adjust = adjust + self.ignore_na = ignore_na + self.axis = axis + self.on = None + + @property + def _constructor(self): + return EWM + + _agg_see_also_doc = dedent( + """ + See Also + -------- + pandas.DataFrame.rolling.aggregate + """ + ) + + _agg_examples_doc = dedent( + """ + Examples + -------- + + >>> df = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C']) + >>> df + A B C + 0 -2.385977 -0.102758 0.438822 + 1 -1.004295 0.905829 -0.954544 + 2 0.735167 -0.165272 -1.619346 + 3 -0.702657 -1.340923 -0.706334 + 4 -0.246845 0.211596 -0.901819 + 5 2.463718 3.157577 -1.380906 + 6 -1.142255 2.340594 -0.039875 + 7 1.396598 -1.647453 1.677227 + 8 -0.543425 1.761277 -0.220481 + 9 -0.640505 0.289374 -1.550670 + + >>> df.ewm(alpha=0.5).mean() + A B C + 0 -2.385977 -0.102758 0.438822 + 1 -1.464856 0.569633 -0.490089 + 2 -0.207700 0.149687 -1.135379 + 3 -0.471677 -0.645305 -0.906555 + 4 -0.355635 -0.203033 -0.904111 + 5 1.076417 1.503943 -1.146293 + 6 -0.041654 1.925562 -0.588728 + 7 0.680292 0.132049 0.548693 + 8 0.067236 0.948257 0.163353 + 9 -0.286980 0.618493 -0.694496 + """ + ) + + @Substitution( + see_also=_agg_see_also_doc, + examples=_agg_examples_doc, + versionadded="", + klass="Series/Dataframe", + axis="", + ) + @Appender(_shared_docs["aggregate"]) + def aggregate(self, func, *args, **kwargs): + return super().aggregate(func, *args, **kwargs) + + agg = aggregate + + def _apply(self, func, **kwargs): + """ + Rolling statistical measure using supplied function. Designed to be + used with passed-in Cython array-based functions. + + Parameters + ---------- + func : str/callable to apply + + Returns + ------- + y : same type as input argument + """ + blocks, obj = self._create_blocks() + block_list = list(blocks) + + results = [] + exclude = [] + for i, b in enumerate(blocks): + try: + values = self._prep_values(b.values) + + except (TypeError, NotImplementedError): + if isinstance(obj, ABCDataFrame): + exclude.extend(b.columns) + del block_list[i] + continue + else: + raise DataError("No numeric types to aggregate") + + if values.size == 0: + results.append(values.copy()) + continue + + # if we have a string function name, wrap it + if isinstance(func, str): + cfunc = getattr(libwindow, func, None) + if cfunc is None: + raise ValueError( + "we do not support this function " + "in libwindow.{func}".format(func=func) + ) + + def func(arg): + return cfunc( + arg, + self.com, + int(self.adjust), + int(self.ignore_na), + int(self.min_periods), + ) + + results.append(np.apply_along_axis(func, self.axis, values)) + + return self._wrap_results(results, block_list, obj, exclude) + + @Substitution(name="ewm") + @Appender(_doc_template) + def mean(self, *args, **kwargs): + """ + Exponential weighted moving average. 
+ + Parameters + ---------- + *args, **kwargs + Arguments and keyword arguments to be passed into func. + """ + nv.validate_window_func("mean", args, kwargs) + return self._apply("ewma", **kwargs) + + @Substitution(name="ewm") + @Appender(_doc_template) + @Appender(_bias_template) + def std(self, bias=False, *args, **kwargs): + """ + Exponential weighted moving stddev. + """ + nv.validate_window_func("std", args, kwargs) + return _zsqrt(self.var(bias=bias, **kwargs)) + + vol = std + + @Substitution(name="ewm") + @Appender(_doc_template) + @Appender(_bias_template) + def var(self, bias=False, *args, **kwargs): + """ + Exponential weighted moving variance. + """ + nv.validate_window_func("var", args, kwargs) + + def f(arg): + return libwindow.ewmcov( + arg, + arg, + self.com, + int(self.adjust), + int(self.ignore_na), + int(self.min_periods), + int(bias), + ) + + return self._apply(f, **kwargs) + + @Substitution(name="ewm") + @Appender(_doc_template) + @Appender(_pairwise_template) + def cov(self, other=None, pairwise=None, bias=False, **kwargs): + """ + Exponential weighted sample covariance. + """ + if other is None: + other = self._selected_obj + # only default unset + pairwise = True if pairwise is None else pairwise + other = self._shallow_copy(other) + + def _get_cov(X, Y): + X = self._shallow_copy(X) + Y = self._shallow_copy(Y) + cov = libwindow.ewmcov( + X._prep_values(), + Y._prep_values(), + self.com, + int(self.adjust), + int(self.ignore_na), + int(self.min_periods), + int(bias), + ) + return X._wrap_result(cov) + + return _flex_binary_moment( + self._selected_obj, other._selected_obj, _get_cov, pairwise=bool(pairwise) + ) + + @Substitution(name="ewm") + @Appender(_doc_template) + @Appender(_pairwise_template) + def corr(self, other=None, pairwise=None, **kwargs): + """ + Exponential weighted sample correlation. + """ + if other is None: + other = self._selected_obj + # only default unset + pairwise = True if pairwise is None else pairwise + other = self._shallow_copy(other) + + def _get_corr(X, Y): + X = self._shallow_copy(X) + Y = self._shallow_copy(Y) + + def _cov(x, y): + return libwindow.ewmcov( + x, + y, + self.com, + int(self.adjust), + int(self.ignore_na), + int(self.min_periods), + 1, + ) + + x_values = X._prep_values() + y_values = Y._prep_values() + with np.errstate(all="ignore"): + cov = _cov(x_values, y_values) + x_var = _cov(x_values, x_values) + y_var = _cov(y_values, y_values) + corr = cov / _zsqrt(x_var * y_var) + return X._wrap_result(corr) + + return _flex_binary_moment( + self._selected_obj, other._selected_obj, _get_corr, pairwise=bool(pairwise) + ) diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py new file mode 100644 index 0000000000000..47bd8f2ec593b --- /dev/null +++ b/pandas/core/window/expanding.py @@ -0,0 +1,260 @@ +from textwrap import dedent + +from pandas.compat.numpy import function as nv +from pandas.util._decorators import Appender, Substitution + +from pandas.core.window.common import _doc_template, _GroupByMixin, _shared_docs +from pandas.core.window.rolling import _Rolling_and_Expanding + + +class Expanding(_Rolling_and_Expanding): + """ + Provide expanding transformations. + + Parameters + ---------- + min_periods : int, default 1 + Minimum number of observations in window required to have a value + (otherwise result is NA). + center : bool, default False + Set the labels at the center of the window. 
+ axis : int or str, default 0 + + Returns + ------- + a Window sub-classed for the particular operation + + See Also + -------- + rolling : Provides rolling window calculations. + ewm : Provides exponential weighted functions. + + Notes + ----- + By default, the result is set to the right edge of the window. This can be + changed to the center of the window by setting ``center=True``. + + Examples + -------- + + >>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}) + B + 0 0.0 + 1 1.0 + 2 2.0 + 3 NaN + 4 4.0 + + >>> df.expanding(2).sum() + B + 0 NaN + 1 1.0 + 2 3.0 + 3 3.0 + 4 7.0 + """ + + _attributes = ["min_periods", "center", "axis"] + + def __init__(self, obj, min_periods=1, center=False, axis=0, **kwargs): + super().__init__(obj=obj, min_periods=min_periods, center=center, axis=axis) + + @property + def _constructor(self): + return Expanding + + def _get_window(self, other=None, **kwargs): + """ + Get the window length over which to perform some operation. + + Parameters + ---------- + other : object, default None + The other object that is involved in the operation. + Such an object is involved for operations like covariance. + + Returns + ------- + window : int + The window length. + """ + axis = self.obj._get_axis(self.axis) + length = len(axis) + (other is not None) * len(axis) + + other = self.min_periods or -1 + return max(length, other) + + _agg_see_also_doc = dedent( + """ + See Also + -------- + DataFrame.expanding.aggregate + DataFrame.rolling.aggregate + DataFrame.aggregate + """ + ) + + _agg_examples_doc = dedent( + """ + Examples + -------- + + >>> df = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C']) + >>> df + A B C + 0 -2.385977 -0.102758 0.438822 + 1 -1.004295 0.905829 -0.954544 + 2 0.735167 -0.165272 -1.619346 + 3 -0.702657 -1.340923 -0.706334 + 4 -0.246845 0.211596 -0.901819 + 5 2.463718 3.157577 -1.380906 + 6 -1.142255 2.340594 -0.039875 + 7 1.396598 -1.647453 1.677227 + 8 -0.543425 1.761277 -0.220481 + 9 -0.640505 0.289374 -1.550670 + + >>> df.ewm(alpha=0.5).mean() + A B C + 0 -2.385977 -0.102758 0.438822 + 1 -1.464856 0.569633 -0.490089 + 2 -0.207700 0.149687 -1.135379 + 3 -0.471677 -0.645305 -0.906555 + 4 -0.355635 -0.203033 -0.904111 + 5 1.076417 1.503943 -1.146293 + 6 -0.041654 1.925562 -0.588728 + 7 0.680292 0.132049 0.548693 + 8 0.067236 0.948257 0.163353 + 9 -0.286980 0.618493 -0.694496 + """ + ) + + @Substitution( + see_also=_agg_see_also_doc, + examples=_agg_examples_doc, + versionadded="", + klass="Series/Dataframe", + axis="", + ) + @Appender(_shared_docs["aggregate"]) + def aggregate(self, func, *args, **kwargs): + return super().aggregate(func, *args, **kwargs) + + agg = aggregate + + @Substitution(name="expanding") + @Appender(_shared_docs["count"]) + def count(self, **kwargs): + return super().count(**kwargs) + + @Substitution(name="expanding") + @Appender(_shared_docs["apply"]) + def apply(self, func, raw=None, args=(), kwargs={}): + return super().apply(func, raw=raw, args=args, kwargs=kwargs) + + @Substitution(name="expanding") + @Appender(_shared_docs["sum"]) + def sum(self, *args, **kwargs): + nv.validate_expanding_func("sum", args, kwargs) + return super().sum(*args, **kwargs) + + @Substitution(name="expanding") + @Appender(_doc_template) + @Appender(_shared_docs["max"]) + def max(self, *args, **kwargs): + nv.validate_expanding_func("max", args, kwargs) + return super().max(*args, **kwargs) + + @Substitution(name="expanding") + @Appender(_shared_docs["min"]) + def min(self, *args, **kwargs): + nv.validate_expanding_func("min", 
args, kwargs) + return super().min(*args, **kwargs) + + @Substitution(name="expanding") + @Appender(_shared_docs["mean"]) + def mean(self, *args, **kwargs): + nv.validate_expanding_func("mean", args, kwargs) + return super().mean(*args, **kwargs) + + @Substitution(name="expanding") + @Appender(_shared_docs["median"]) + def median(self, **kwargs): + return super().median(**kwargs) + + @Substitution(name="expanding") + @Appender(_shared_docs["std"]) + def std(self, ddof=1, *args, **kwargs): + nv.validate_expanding_func("std", args, kwargs) + return super().std(ddof=ddof, **kwargs) + + @Substitution(name="expanding") + @Appender(_shared_docs["var"]) + def var(self, ddof=1, *args, **kwargs): + nv.validate_expanding_func("var", args, kwargs) + return super().var(ddof=ddof, **kwargs) + + @Substitution(name="expanding") + @Appender(_doc_template) + @Appender(_shared_docs["skew"]) + def skew(self, **kwargs): + return super().skew(**kwargs) + + _agg_doc = dedent( + """ + Examples + -------- + + The example below will show an expanding calculation with a window size of + four matching the equivalent function call using `scipy.stats`. + + >>> arr = [1, 2, 3, 4, 999] + >>> import scipy.stats + >>> fmt = "{0:.6f}" # limit the printed precision to 6 digits + >>> print(fmt.format(scipy.stats.kurtosis(arr[:-1], bias=False))) + -1.200000 + >>> print(fmt.format(scipy.stats.kurtosis(arr, bias=False))) + 4.999874 + >>> s = pd.Series(arr) + >>> s.expanding(4).kurt() + 0 NaN + 1 NaN + 2 NaN + 3 -1.200000 + 4 4.999874 + dtype: float64 + """ + ) + + @Appender(_agg_doc) + @Substitution(name="expanding") + @Appender(_shared_docs["kurt"]) + def kurt(self, **kwargs): + return super().kurt(**kwargs) + + @Substitution(name="expanding") + @Appender(_shared_docs["quantile"]) + def quantile(self, quantile, interpolation="linear", **kwargs): + return super().quantile( + quantile=quantile, interpolation=interpolation, **kwargs + ) + + @Substitution(name="expanding") + @Appender(_doc_template) + @Appender(_shared_docs["cov"]) + def cov(self, other=None, pairwise=None, ddof=1, **kwargs): + return super().cov(other=other, pairwise=pairwise, ddof=ddof, **kwargs) + + @Substitution(name="expanding") + @Appender(_shared_docs["corr"]) + def corr(self, other=None, pairwise=None, **kwargs): + return super().corr(other=other, pairwise=pairwise, **kwargs) + + +class ExpandingGroupby(_GroupByMixin, Expanding): + """ + Provide a expanding groupby implementation. + """ + + @property + def _constructor(self): + return Expanding diff --git a/pandas/core/window.py b/pandas/core/window/rolling.py similarity index 65% rename from pandas/core/window.py rename to pandas/core/window/rolling.py index 4b6a1cf2e9a04..a7e122fa3528f 100644 --- a/pandas/core/window.py +++ b/pandas/core/window/rolling.py @@ -2,7 +2,6 @@ Provide a generic structure to support window functions, similar to how we have a Groupby object. """ -from collections import defaultdict from datetime import timedelta from textwrap import dedent from typing import Callable, List, Optional, Set, Union @@ -38,22 +37,17 @@ from pandas._typing import Axis, FrameOrSeries, Scalar from pandas.core.base import DataError, PandasObject, SelectionMixin import pandas.core.common as com -from pandas.core.generic import _shared_docs -from pandas.core.groupby.base import GroupByMixin -from pandas.core.index import Index, MultiIndex, ensure_index - -_shared_docs = dict(**_shared_docs) -_doc_template = """ - Returns - ------- - Series or DataFrame - Return type is determined by the caller. 
- - See Also - -------- - Series.%(name)s : Series %(name)s. - DataFrame.%(name)s : DataFrame %(name)s. -""" +from pandas.core.index import Index, ensure_index +from pandas.core.window.common import ( + _doc_template, + _flex_binary_moment, + _GroupByMixin, + _offset, + _require_min_periods, + _shared_docs, + _use_window, + _zsqrt, +) class _Window(PandasObject, SelectionMixin): @@ -121,6 +115,8 @@ def validate(self): "neither", ]: raise ValueError("closed must be 'right', 'left', 'both' or 'neither'") + if not isinstance(self.obj, (ABCSeries, ABCDataFrame)): + raise TypeError("invalid type: {}".format(type(self))) def _create_blocks(self): """ @@ -246,8 +242,10 @@ def _prep_values(self, values: Optional[np.ndarray] = None) -> np.ndarray: except (ValueError, TypeError): raise TypeError("cannot handle this type -> {0}".format(values.dtype)) - # Always convert inf to nan - values[np.isinf(values)] = np.NaN + # Convert inf to nan for C funcs + inf = np.isinf(values) + if inf.any(): + values = np.where(inf, np.nan, values) return values @@ -265,6 +263,8 @@ def _wrap_result(self, result, block=None, obj=None): # coerce if necessary if block is not None: if is_timedelta64_dtype(block.values.dtype): + # TODO: do we know what result.dtype is at this point? + # i.e. can we just do an astype? from pandas import to_timedelta result = to_timedelta(result.ravel(), unit="ns").values.reshape( @@ -901,12 +901,12 @@ def func(arg, window, min_periods=None, closed=None): axis="", ) @Appender(_shared_docs["aggregate"]) - def aggregate(self, arg, *args, **kwargs): - result, how = self._aggregate(arg, *args, **kwargs) + def aggregate(self, func, *args, **kwargs): + result, how = self._aggregate(func, *args, **kwargs) if result is None: # these must apply directly - result = arg(self) + result = func(self) return result @@ -925,44 +925,6 @@ def mean(self, *args, **kwargs): return self._apply("roll_weighted_mean", **kwargs) -class _GroupByMixin(GroupByMixin): - """ - Provide the groupby facilities. - """ - - def __init__(self, obj, *args, **kwargs): - parent = kwargs.pop("parent", None) # noqa - groupby = kwargs.pop("groupby", None) - if groupby is None: - groupby, obj = obj, obj.obj - self._groupby = groupby - self._groupby.mutated = True - self._groupby.grouper.mutated = True - super().__init__(obj, *args, **kwargs) - - count = GroupByMixin._dispatch("count") - corr = GroupByMixin._dispatch("corr", other=None, pairwise=None) - cov = GroupByMixin._dispatch("cov", other=None, pairwise=None) - - def _apply( - self, func, name=None, window=None, center=None, check_minp=None, **kwargs - ): - """ - Dispatch to apply; we are stripping all of the _apply kwargs and - performing the original function call on the grouped object. 
- """ - - def f(x, name=name, *args): - x = self._shallow_copy(x) - - if isinstance(name, str): - return getattr(x, name)(*args, **kwargs) - - return x.apply(name, *args, **kwargs) - - return self._groupby.apply(f) - - class _Rolling(_Window): @property def _constructor(self): @@ -1826,8 +1788,8 @@ def _validate_freq(self): axis="", ) @Appender(_shared_docs["aggregate"]) - def aggregate(self, arg, *args, **kwargs): - return super().aggregate(arg, *args, **kwargs) + def aggregate(self, func, *args, **kwargs): + return super().aggregate(func, *args, **kwargs) agg = aggregate @@ -1945,6 +1907,9 @@ def corr(self, other=None, pairwise=None, **kwargs): return super().corr(other=other, pairwise=pairwise, **kwargs) +Rolling.__doc__ = Window.__doc__ + + class RollingGroupby(_GroupByMixin, Rolling): """ Provide a rolling groupby implementation. @@ -1972,883 +1937,3 @@ def _validate_monotonic(self): level. """ pass - - -class Expanding(_Rolling_and_Expanding): - """ - Provide expanding transformations. - - Parameters - ---------- - min_periods : int, default 1 - Minimum number of observations in window required to have a value - (otherwise result is NA). - center : bool, default False - Set the labels at the center of the window. - axis : int or str, default 0 - - Returns - ------- - a Window sub-classed for the particular operation - - See Also - -------- - rolling : Provides rolling window calculations. - ewm : Provides exponential weighted functions. - - Notes - ----- - By default, the result is set to the right edge of the window. This can be - changed to the center of the window by setting ``center=True``. - - Examples - -------- - - >>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}) - B - 0 0.0 - 1 1.0 - 2 2.0 - 3 NaN - 4 4.0 - - >>> df.expanding(2).sum() - B - 0 NaN - 1 1.0 - 2 3.0 - 3 3.0 - 4 7.0 - """ - - _attributes = ["min_periods", "center", "axis"] - - def __init__(self, obj, min_periods=1, center=False, axis=0, **kwargs): - super().__init__(obj=obj, min_periods=min_periods, center=center, axis=axis) - - @property - def _constructor(self): - return Expanding - - def _get_window(self, other=None, **kwargs): - """ - Get the window length over which to perform some operation. - - Parameters - ---------- - other : object, default None - The other object that is involved in the operation. - Such an object is involved for operations like covariance. - - Returns - ------- - window : int - The window length. 
- """ - axis = self.obj._get_axis(self.axis) - length = len(axis) + (other is not None) * len(axis) - - other = self.min_periods or -1 - return max(length, other) - - _agg_see_also_doc = dedent( - """ - See Also - -------- - DataFrame.expanding.aggregate - DataFrame.rolling.aggregate - DataFrame.aggregate - """ - ) - - _agg_examples_doc = dedent( - """ - Examples - -------- - - >>> df = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C']) - >>> df - A B C - 0 -2.385977 -0.102758 0.438822 - 1 -1.004295 0.905829 -0.954544 - 2 0.735167 -0.165272 -1.619346 - 3 -0.702657 -1.340923 -0.706334 - 4 -0.246845 0.211596 -0.901819 - 5 2.463718 3.157577 -1.380906 - 6 -1.142255 2.340594 -0.039875 - 7 1.396598 -1.647453 1.677227 - 8 -0.543425 1.761277 -0.220481 - 9 -0.640505 0.289374 -1.550670 - - >>> df.ewm(alpha=0.5).mean() - A B C - 0 -2.385977 -0.102758 0.438822 - 1 -1.464856 0.569633 -0.490089 - 2 -0.207700 0.149687 -1.135379 - 3 -0.471677 -0.645305 -0.906555 - 4 -0.355635 -0.203033 -0.904111 - 5 1.076417 1.503943 -1.146293 - 6 -0.041654 1.925562 -0.588728 - 7 0.680292 0.132049 0.548693 - 8 0.067236 0.948257 0.163353 - 9 -0.286980 0.618493 -0.694496 - """ - ) - - @Substitution( - see_also=_agg_see_also_doc, - examples=_agg_examples_doc, - versionadded="", - klass="Series/Dataframe", - axis="", - ) - @Appender(_shared_docs["aggregate"]) - def aggregate(self, arg, *args, **kwargs): - return super().aggregate(arg, *args, **kwargs) - - agg = aggregate - - @Substitution(name="expanding") - @Appender(_shared_docs["count"]) - def count(self, **kwargs): - return super().count(**kwargs) - - @Substitution(name="expanding") - @Appender(_shared_docs["apply"]) - def apply(self, func, raw=None, args=(), kwargs={}): - return super().apply(func, raw=raw, args=args, kwargs=kwargs) - - @Substitution(name="expanding") - @Appender(_shared_docs["sum"]) - def sum(self, *args, **kwargs): - nv.validate_expanding_func("sum", args, kwargs) - return super().sum(*args, **kwargs) - - @Substitution(name="expanding") - @Appender(_doc_template) - @Appender(_shared_docs["max"]) - def max(self, *args, **kwargs): - nv.validate_expanding_func("max", args, kwargs) - return super().max(*args, **kwargs) - - @Substitution(name="expanding") - @Appender(_shared_docs["min"]) - def min(self, *args, **kwargs): - nv.validate_expanding_func("min", args, kwargs) - return super().min(*args, **kwargs) - - @Substitution(name="expanding") - @Appender(_shared_docs["mean"]) - def mean(self, *args, **kwargs): - nv.validate_expanding_func("mean", args, kwargs) - return super().mean(*args, **kwargs) - - @Substitution(name="expanding") - @Appender(_shared_docs["median"]) - def median(self, **kwargs): - return super().median(**kwargs) - - @Substitution(name="expanding") - @Appender(_shared_docs["std"]) - def std(self, ddof=1, *args, **kwargs): - nv.validate_expanding_func("std", args, kwargs) - return super().std(ddof=ddof, **kwargs) - - @Substitution(name="expanding") - @Appender(_shared_docs["var"]) - def var(self, ddof=1, *args, **kwargs): - nv.validate_expanding_func("var", args, kwargs) - return super().var(ddof=ddof, **kwargs) - - @Substitution(name="expanding") - @Appender(_doc_template) - @Appender(_shared_docs["skew"]) - def skew(self, **kwargs): - return super().skew(**kwargs) - - _agg_doc = dedent( - """ - Examples - -------- - - The example below will show an expanding calculation with a window size of - four matching the equivalent function call using `scipy.stats`. 
- - >>> arr = [1, 2, 3, 4, 999] - >>> import scipy.stats - >>> fmt = "{0:.6f}" # limit the printed precision to 6 digits - >>> print(fmt.format(scipy.stats.kurtosis(arr[:-1], bias=False))) - -1.200000 - >>> print(fmt.format(scipy.stats.kurtosis(arr, bias=False))) - 4.999874 - >>> s = pd.Series(arr) - >>> s.expanding(4).kurt() - 0 NaN - 1 NaN - 2 NaN - 3 -1.200000 - 4 4.999874 - dtype: float64 - """ - ) - - @Appender(_agg_doc) - @Substitution(name="expanding") - @Appender(_shared_docs["kurt"]) - def kurt(self, **kwargs): - return super().kurt(**kwargs) - - @Substitution(name="expanding") - @Appender(_shared_docs["quantile"]) - def quantile(self, quantile, interpolation="linear", **kwargs): - return super().quantile( - quantile=quantile, interpolation=interpolation, **kwargs - ) - - @Substitution(name="expanding") - @Appender(_doc_template) - @Appender(_shared_docs["cov"]) - def cov(self, other=None, pairwise=None, ddof=1, **kwargs): - return super().cov(other=other, pairwise=pairwise, ddof=ddof, **kwargs) - - @Substitution(name="expanding") - @Appender(_shared_docs["corr"]) - def corr(self, other=None, pairwise=None, **kwargs): - return super().corr(other=other, pairwise=pairwise, **kwargs) - - -class ExpandingGroupby(_GroupByMixin, Expanding): - """ - Provide a expanding groupby implementation. - """ - - @property - def _constructor(self): - return Expanding - - -_bias_template = """ - Parameters - ---------- - bias : bool, default False - Use a standard estimation bias correction. - *args, **kwargs - Arguments and keyword arguments to be passed into func. -""" - -_pairwise_template = """ - Parameters - ---------- - other : Series, DataFrame, or ndarray, optional - If not supplied then will default to self and produce pairwise - output. - pairwise : bool, default None - If False then only matching columns between self and other will be - used and the output will be a DataFrame. - If True then all pairwise combinations will be calculated and the - output will be a MultiIndex DataFrame in the case of DataFrame - inputs. In the case of missing elements, only complete pairwise - observations will be used. - bias : bool, default False - Use a standard estimation bias correction. - **kwargs - Keyword arguments to be passed into func. -""" - - -class EWM(_Rolling): - r""" - Provide exponential weighted functions. - - Parameters - ---------- - com : float, optional - Specify decay in terms of center of mass, - :math:`\alpha = 1 / (1 + com),\text{ for } com \geq 0`. - span : float, optional - Specify decay in terms of span, - :math:`\alpha = 2 / (span + 1),\text{ for } span \geq 1`. - halflife : float, optional - Specify decay in terms of half-life, - :math:`\alpha = 1 - exp(log(0.5) / halflife),\text{for} halflife > 0`. - alpha : float, optional - Specify smoothing factor :math:`\alpha` directly, - :math:`0 < \alpha \leq 1`. - min_periods : int, default 0 - Minimum number of observations in window required to have a value - (otherwise result is NA). - adjust : bool, default True - Divide by decaying adjustment factor in beginning periods to account - for imbalance in relative weightings - (viewing EWMA as a moving average). - ignore_na : bool, default False - Ignore missing values when calculating weights; - specify True to reproduce pre-0.15.0 behavior. - axis : {0 or 'index', 1 or 'columns'}, default 0 - The axis to use. The value 0 identifies the rows, and 1 - identifies the columns. - - Returns - ------- - DataFrame - A Window sub-classed for the particular operation. 
- - See Also - -------- - rolling : Provides rolling window calculations. - expanding : Provides expanding transformations. - - Notes - ----- - Exactly one of center of mass, span, half-life, and alpha must be provided. - Allowed values and relationship between the parameters are specified in the - parameter descriptions above; see the link at the end of this section for - a detailed explanation. - - When adjust is True (default), weighted averages are calculated using - weights (1-alpha)**(n-1), (1-alpha)**(n-2), ..., 1-alpha, 1. - - When adjust is False, weighted averages are calculated recursively as: - weighted_average[0] = arg[0]; - weighted_average[i] = (1-alpha)*weighted_average[i-1] + alpha*arg[i]. - - When ignore_na is False (default), weights are based on absolute positions. - For example, the weights of x and y used in calculating the final weighted - average of [x, None, y] are (1-alpha)**2 and 1 (if adjust is True), and - (1-alpha)**2 and alpha (if adjust is False). - - When ignore_na is True (reproducing pre-0.15.0 behavior), weights are based - on relative positions. For example, the weights of x and y used in - calculating the final weighted average of [x, None, y] are 1-alpha and 1 - (if adjust is True), and 1-alpha and alpha (if adjust is False). - - More details can be found at - http://pandas.pydata.org/pandas-docs/stable/user_guide/computation.html#exponentially-weighted-windows - - Examples - -------- - - >>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}) - >>> df - B - 0 0.0 - 1 1.0 - 2 2.0 - 3 NaN - 4 4.0 - - >>> df.ewm(com=0.5).mean() - B - 0 0.000000 - 1 0.750000 - 2 1.615385 - 3 1.615385 - 4 3.670213 - """ - _attributes = ["com", "min_periods", "adjust", "ignore_na", "axis"] - - def __init__( - self, - obj, - com=None, - span=None, - halflife=None, - alpha=None, - min_periods=0, - adjust=True, - ignore_na=False, - axis=0, - ): - self.obj = obj - self.com = _get_center_of_mass(com, span, halflife, alpha) - self.min_periods = min_periods - self.adjust = adjust - self.ignore_na = ignore_na - self.axis = axis - self.on = None - - @property - def _constructor(self): - return EWM - - _agg_see_also_doc = dedent( - """ - See Also - -------- - pandas.DataFrame.rolling.aggregate - """ - ) - - _agg_examples_doc = dedent( - """ - Examples - -------- - - >>> df = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C']) - >>> df - A B C - 0 -2.385977 -0.102758 0.438822 - 1 -1.004295 0.905829 -0.954544 - 2 0.735167 -0.165272 -1.619346 - 3 -0.702657 -1.340923 -0.706334 - 4 -0.246845 0.211596 -0.901819 - 5 2.463718 3.157577 -1.380906 - 6 -1.142255 2.340594 -0.039875 - 7 1.396598 -1.647453 1.677227 - 8 -0.543425 1.761277 -0.220481 - 9 -0.640505 0.289374 -1.550670 - - >>> df.ewm(alpha=0.5).mean() - A B C - 0 -2.385977 -0.102758 0.438822 - 1 -1.464856 0.569633 -0.490089 - 2 -0.207700 0.149687 -1.135379 - 3 -0.471677 -0.645305 -0.906555 - 4 -0.355635 -0.203033 -0.904111 - 5 1.076417 1.503943 -1.146293 - 6 -0.041654 1.925562 -0.588728 - 7 0.680292 0.132049 0.548693 - 8 0.067236 0.948257 0.163353 - 9 -0.286980 0.618493 -0.694496 - """ - ) - - @Substitution( - see_also=_agg_see_also_doc, - examples=_agg_examples_doc, - versionadded="", - klass="Series/Dataframe", - axis="", - ) - @Appender(_shared_docs["aggregate"]) - def aggregate(self, arg, *args, **kwargs): - return super().aggregate(arg, *args, **kwargs) - - agg = aggregate - - def _apply(self, func, **kwargs): - """ - Rolling statistical measure using supplied function. 
Designed to be - used with passed-in Cython array-based functions. - - Parameters - ---------- - func : str/callable to apply - - Returns - ------- - y : same type as input argument - """ - blocks, obj = self._create_blocks() - block_list = list(blocks) - - results = [] - exclude = [] - for i, b in enumerate(blocks): - try: - values = self._prep_values(b.values) - - except (TypeError, NotImplementedError): - if isinstance(obj, ABCDataFrame): - exclude.extend(b.columns) - del block_list[i] - continue - else: - raise DataError("No numeric types to aggregate") - - if values.size == 0: - results.append(values.copy()) - continue - - # if we have a string function name, wrap it - if isinstance(func, str): - cfunc = getattr(libwindow, func, None) - if cfunc is None: - raise ValueError( - "we do not support this function " - "in libwindow.{func}".format(func=func) - ) - - def func(arg): - return cfunc( - arg, - self.com, - int(self.adjust), - int(self.ignore_na), - int(self.min_periods), - ) - - results.append(np.apply_along_axis(func, self.axis, values)) - - return self._wrap_results(results, block_list, obj, exclude) - - @Substitution(name="ewm") - @Appender(_doc_template) - def mean(self, *args, **kwargs): - """ - Exponential weighted moving average. - - Parameters - ---------- - *args, **kwargs - Arguments and keyword arguments to be passed into func. - """ - nv.validate_window_func("mean", args, kwargs) - return self._apply("ewma", **kwargs) - - @Substitution(name="ewm") - @Appender(_doc_template) - @Appender(_bias_template) - def std(self, bias=False, *args, **kwargs): - """ - Exponential weighted moving stddev. - """ - nv.validate_window_func("std", args, kwargs) - return _zsqrt(self.var(bias=bias, **kwargs)) - - vol = std - - @Substitution(name="ewm") - @Appender(_doc_template) - @Appender(_bias_template) - def var(self, bias=False, *args, **kwargs): - """ - Exponential weighted moving variance. - """ - nv.validate_window_func("var", args, kwargs) - - def f(arg): - return libwindow.ewmcov( - arg, - arg, - self.com, - int(self.adjust), - int(self.ignore_na), - int(self.min_periods), - int(bias), - ) - - return self._apply(f, **kwargs) - - @Substitution(name="ewm") - @Appender(_doc_template) - @Appender(_pairwise_template) - def cov(self, other=None, pairwise=None, bias=False, **kwargs): - """ - Exponential weighted sample covariance. - """ - if other is None: - other = self._selected_obj - # only default unset - pairwise = True if pairwise is None else pairwise - other = self._shallow_copy(other) - - def _get_cov(X, Y): - X = self._shallow_copy(X) - Y = self._shallow_copy(Y) - cov = libwindow.ewmcov( - X._prep_values(), - Y._prep_values(), - self.com, - int(self.adjust), - int(self.ignore_na), - int(self.min_periods), - int(bias), - ) - return X._wrap_result(cov) - - return _flex_binary_moment( - self._selected_obj, other._selected_obj, _get_cov, pairwise=bool(pairwise) - ) - - @Substitution(name="ewm") - @Appender(_doc_template) - @Appender(_pairwise_template) - def corr(self, other=None, pairwise=None, **kwargs): - """ - Exponential weighted sample correlation. 
- """ - if other is None: - other = self._selected_obj - # only default unset - pairwise = True if pairwise is None else pairwise - other = self._shallow_copy(other) - - def _get_corr(X, Y): - X = self._shallow_copy(X) - Y = self._shallow_copy(Y) - - def _cov(x, y): - return libwindow.ewmcov( - x, - y, - self.com, - int(self.adjust), - int(self.ignore_na), - int(self.min_periods), - 1, - ) - - x_values = X._prep_values() - y_values = Y._prep_values() - with np.errstate(all="ignore"): - cov = _cov(x_values, y_values) - x_var = _cov(x_values, x_values) - y_var = _cov(y_values, y_values) - corr = cov / _zsqrt(x_var * y_var) - return X._wrap_result(corr) - - return _flex_binary_moment( - self._selected_obj, other._selected_obj, _get_corr, pairwise=bool(pairwise) - ) - - -# Helper Funcs - - -def _flex_binary_moment(arg1, arg2, f, pairwise=False): - - if not ( - isinstance(arg1, (np.ndarray, ABCSeries, ABCDataFrame)) - and isinstance(arg2, (np.ndarray, ABCSeries, ABCDataFrame)) - ): - raise TypeError( - "arguments to moment function must be of type " - "np.ndarray/Series/DataFrame" - ) - - if isinstance(arg1, (np.ndarray, ABCSeries)) and isinstance( - arg2, (np.ndarray, ABCSeries) - ): - X, Y = _prep_binary(arg1, arg2) - return f(X, Y) - - elif isinstance(arg1, ABCDataFrame): - from pandas import DataFrame - - def dataframe_from_int_dict(data, frame_template): - result = DataFrame(data, index=frame_template.index) - if len(result.columns) > 0: - result.columns = frame_template.columns[result.columns] - return result - - results = {} - if isinstance(arg2, ABCDataFrame): - if pairwise is False: - if arg1 is arg2: - # special case in order to handle duplicate column names - for i, col in enumerate(arg1.columns): - results[i] = f(arg1.iloc[:, i], arg2.iloc[:, i]) - return dataframe_from_int_dict(results, arg1) - else: - if not arg1.columns.is_unique: - raise ValueError("'arg1' columns are not unique") - if not arg2.columns.is_unique: - raise ValueError("'arg2' columns are not unique") - with warnings.catch_warnings(record=True): - warnings.simplefilter("ignore", RuntimeWarning) - X, Y = arg1.align(arg2, join="outer") - X = X + 0 * Y - Y = Y + 0 * X - - with warnings.catch_warnings(record=True): - warnings.simplefilter("ignore", RuntimeWarning) - res_columns = arg1.columns.union(arg2.columns) - for col in res_columns: - if col in X and col in Y: - results[col] = f(X[col], Y[col]) - return DataFrame(results, index=X.index, columns=res_columns) - elif pairwise is True: - results = defaultdict(dict) - for i, k1 in enumerate(arg1.columns): - for j, k2 in enumerate(arg2.columns): - if j < i and arg2 is arg1: - # Symmetric case - results[i][j] = results[j][i] - else: - results[i][j] = f( - *_prep_binary(arg1.iloc[:, i], arg2.iloc[:, j]) - ) - - from pandas import concat - - result_index = arg1.index.union(arg2.index) - if len(result_index): - - # construct result frame - result = concat( - [ - concat( - [results[i][j] for j, c in enumerate(arg2.columns)], - ignore_index=True, - ) - for i, c in enumerate(arg1.columns) - ], - ignore_index=True, - axis=1, - ) - result.columns = arg1.columns - - # set the index and reorder - if arg2.columns.nlevels > 1: - result.index = MultiIndex.from_product( - arg2.columns.levels + [result_index] - ) - result = result.reorder_levels([2, 0, 1]).sort_index() - else: - result.index = MultiIndex.from_product( - [range(len(arg2.columns)), range(len(result_index))] - ) - result = result.swaplevel(1, 0).sort_index() - result.index = MultiIndex.from_product( - [result_index] + 
[arg2.columns] - ) - else: - - # empty result - result = DataFrame( - index=MultiIndex( - levels=[arg1.index, arg2.columns], codes=[[], []] - ), - columns=arg2.columns, - dtype="float64", - ) - - # reset our index names to arg1 names - # reset our column names to arg2 names - # careful not to mutate the original names - result.columns = result.columns.set_names(arg1.columns.names) - result.index = result.index.set_names( - result_index.names + arg2.columns.names - ) - - return result - - else: - raise ValueError("'pairwise' is not True/False") - else: - results = { - i: f(*_prep_binary(arg1.iloc[:, i], arg2)) - for i, col in enumerate(arg1.columns) - } - return dataframe_from_int_dict(results, arg1) - - else: - return _flex_binary_moment(arg2, arg1, f) - - -def _get_center_of_mass(comass, span, halflife, alpha): - valid_count = com.count_not_none(comass, span, halflife, alpha) - if valid_count > 1: - raise ValueError("comass, span, halflife, and alpha are mutually exclusive") - - # Convert to center of mass; domain checks ensure 0 < alpha <= 1 - if comass is not None: - if comass < 0: - raise ValueError("comass must satisfy: comass >= 0") - elif span is not None: - if span < 1: - raise ValueError("span must satisfy: span >= 1") - comass = (span - 1) / 2.0 - elif halflife is not None: - if halflife <= 0: - raise ValueError("halflife must satisfy: halflife > 0") - decay = 1 - np.exp(np.log(0.5) / halflife) - comass = 1 / decay - 1 - elif alpha is not None: - if alpha <= 0 or alpha > 1: - raise ValueError("alpha must satisfy: 0 < alpha <= 1") - comass = (1.0 - alpha) / alpha - else: - raise ValueError("Must pass one of comass, span, halflife, or alpha") - - return float(comass) - - -def _offset(window, center): - if not is_integer(window): - window = len(window) - offset = (window - 1) / 2.0 if center else 0 - try: - return int(offset) - except TypeError: - return offset.astype(int) - - -def _require_min_periods(p): - def _check_func(minp, window): - if minp is None: - return window - else: - return max(p, minp) - - return _check_func - - -def _use_window(minp, window): - if minp is None: - return window - else: - return minp - - -def _zsqrt(x): - with np.errstate(all="ignore"): - result = np.sqrt(x) - mask = x < 0 - - if isinstance(x, ABCDataFrame): - if mask.values.any(): - result[mask] = 0 - else: - if mask.any(): - result[mask] = 0 - - return result - - -def _prep_binary(arg1, arg2): - if not isinstance(arg2, type(arg1)): - raise Exception("Input arrays must be of the same type!") - - # mask out values, this also makes a common index... 
- X = arg1 + 0 * arg2 - Y = arg2 + 0 * arg1 - - return X, Y - - -# Top-level exports - - -def rolling(obj, win_type=None, **kwds): - if not isinstance(obj, (ABCSeries, ABCDataFrame)): - raise TypeError("invalid type: %s" % type(obj)) - - if win_type is not None: - return Window(obj, win_type=win_type, **kwds) - - return Rolling(obj, **kwds) - - -rolling.__doc__ = Window.__doc__ - - -def expanding(obj, **kwds): - if not isinstance(obj, (ABCSeries, ABCDataFrame)): - raise TypeError("invalid type: %s" % type(obj)) - - return Expanding(obj, **kwds) - - -expanding.__doc__ = Expanding.__doc__ - - -def ewm(obj, **kwds): - if not isinstance(obj, (ABCSeries, ABCDataFrame)): - raise TypeError("invalid type: %s" % type(obj)) - - return EWM(obj, **kwds) - - -ewm.__doc__ = EWM.__doc__ diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py index d38221d784273..76c01535a26e7 100644 --- a/pandas/io/clipboards.py +++ b/pandas/io/clipboards.py @@ -9,8 +9,7 @@ def read_clipboard(sep=r"\s+", **kwargs): # pragma: no cover r""" - Read text from clipboard and pass to read_csv. See read_csv for the - full argument list + Read text from clipboard and pass to read_csv. Parameters ---------- @@ -18,9 +17,13 @@ def read_clipboard(sep=r"\s+", **kwargs): # pragma: no cover A string or regex delimiter. The default of '\s+' denotes one or more whitespace characters. + **kwargs + See read_csv for the full argument list. + Returns ------- - parsed : DataFrame + DataFrame + A parsed DataFrame object. """ encoding = kwargs.pop("encoding", "utf-8") diff --git a/pandas/io/common.py b/pandas/io/common.py index e01e473047b88..30228d660e816 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -5,12 +5,23 @@ import csv import gzip from http.client import HTTPException # noqa -from io import BytesIO -import lzma +from io import BufferedIOBase, BytesIO import mmap import os import pathlib -from typing import IO, AnyStr, BinaryIO, Optional, TextIO, Type +from typing import ( + IO, + Any, + AnyStr, + BinaryIO, + Dict, + List, + Optional, + TextIO, + Tuple, + Type, + Union, +) from urllib.error import URLError # noqa from urllib.parse import ( # noqa urlencode, @@ -23,6 +34,7 @@ from urllib.request import pathname2url, urlopen import zipfile +from pandas.compat import _get_lzma_file, _import_lzma from pandas.errors import ( # noqa AbstractMethodError, DtypeWarning, @@ -35,6 +47,8 @@ from pandas._typing import FilePathOrBuffer +lzma = _import_lzma() + # gh-12665: Alias for now and remove later. CParserError = ParserError @@ -253,6 +267,40 @@ def file_path_to_url(path: str) -> str: _compression_to_extension = {"gzip": ".gz", "bz2": ".bz2", "zip": ".zip", "xz": ".xz"} +def _get_compression_method( + compression: Optional[Union[str, Dict[str, str]]] +) -> Tuple[Optional[str], Dict[str, str]]: + """ + Simplifies a compression argument to a compression method string and + a dict containing additional arguments. + + Parameters + ---------- + compression : str or dict + If string, specifies the compression method. If dict, value at key + 'method' specifies compression method. 
+ + Returns + ------- + tuple of ({compression method}, Optional[str] + {compression arguments}, Dict[str, str]) + + Raises + ------ + ValueError on dict missing 'method' key + """ + # Handle dict + if isinstance(compression, dict): + compression_args = compression.copy() + try: + compression = compression_args.pop("method") + except KeyError: + raise ValueError("If dict, compression must have key 'method'") + else: + compression_args = {} + return compression, compression_args + + def _infer_compression( filepath_or_buffer: FilePathOrBuffer, compression: Optional[str] ) -> Optional[str]: @@ -264,8 +312,8 @@ def _infer_compression( Parameters ---------- - filepath_or_buffer : - a path (str) or buffer + filepath_or_buffer : str or file handle + File path or object. compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None} If 'infer' and `filepath_or_buffer` is path-like, then detect compression from the following extensions: '.gz', '.bz2', '.zip', @@ -273,12 +321,11 @@ def _infer_compression( Returns ------- - string or None : - compression method + string or None Raises ------ - ValueError on invalid compression specified + ValueError on invalid compression specified. """ # No compression has been explicitly specified @@ -310,49 +357,67 @@ def _infer_compression( def _get_handle( - path_or_buf, mode, encoding=None, compression=None, memory_map=False, is_text=True + path_or_buf, + mode: str, + encoding=None, + compression: Optional[Union[str, Dict[str, Any]]] = None, + memory_map: bool = False, + is_text: bool = True, ): """ Get file handle for given path/buffer and mode. Parameters ---------- - path_or_buf : - a path (str) or buffer + path_or_buf : str or file handle + File path or object. mode : str - mode to open path_or_buf with + Mode to open path_or_buf with. encoding : str or None - compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default None - If 'infer' and `filepath_or_buffer` is path-like, then detect - compression from the following extensions: '.gz', '.bz2', '.zip', - or '.xz' (otherwise no compression). + Encoding to use. + compression : str or dict, default None + If string, specifies compression mode. If dict, value at key 'method' + specifies compression mode. Compression mode must be one of {'infer', + 'gzip', 'bz2', 'zip', 'xz', None}. If compression mode is 'infer' + and `filepath_or_buffer` is path-like, then detect compression from + the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise + no compression). If dict and compression mode is 'zip' or inferred as + 'zip', other entries passed as additional compression options. + + .. versionchanged:: 1.0.0 + + May now be a dict with key 'method' as compression mode + and other keys as compression options if compression + mode is 'zip'. + memory_map : boolean, default False See parsers._parser_params for more information. is_text : boolean, default True whether file/buffer is in text format (csv, json, etc.), or in binary - mode (pickle, etc.) + mode (pickle, etc.). Returns ------- f : file-like - A file-like object + A file-like object. handles : list of file-like objects A list of file-like object that were opened in this function. 
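The dict-valued compression argument documented above is split by _get_compression_method into a method string plus extra options. A small caller-side sketch of how this is exercised through DataFrame.to_csv (per the csvs.py changes further below); the file names are arbitrary, and the archive_name option relies on the BytesZipFile change shown later in this diff.

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3]})

    # A plain string still selects the compression method directly.
    df.to_csv("out.csv.gz", compression="gzip")

    # With a dict, the value under 'method' selects the method and the
    # remaining keys are forwarded as compression options; for zip files
    # this allows naming the file inside the archive.
    df.to_csv("out.zip", compression={"method": "zip", "archive_name": "data.csv"})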
""" try: from s3fs import S3File - need_text_wrapping = (BytesIO, S3File) + need_text_wrapping = (BufferedIOBase, S3File) except ImportError: - need_text_wrapping = (BytesIO,) + need_text_wrapping = BufferedIOBase # type: ignore - handles = list() + handles = list() # type: List[IO] f = path_or_buf # Convert pathlib.Path/py.path.local or string path_or_buf = _stringify_path(path_or_buf) is_path = isinstance(path_or_buf, str) + compression, compression_args = _get_compression_method(compression) if is_path: compression = _infer_compression(path_or_buf, compression) @@ -374,7 +439,7 @@ def _get_handle( # ZIP Compression elif compression == "zip": - zf = BytesZipFile(path_or_buf, mode) + zf = BytesZipFile(path_or_buf, mode, **compression_args) # Ensure the container is closed as well. handles.append(zf) if zf.mode == "w": @@ -395,7 +460,7 @@ def _get_handle( # XZ Compression elif compression == "xz": - f = lzma.LZMAFile(path_or_buf, mode) + f = _get_lzma_file(lzma)(path_or_buf, mode) # Unrecognized Compression else: @@ -420,14 +485,16 @@ def _get_handle( if is_text and (compression or isinstance(f, need_text_wrapping)): from io import TextIOWrapper - f = TextIOWrapper(f, encoding=encoding, newline="") - handles.append(f) + g = TextIOWrapper(f, encoding=encoding, newline="") + if not isinstance(f, BufferedIOBase): + handles.append(g) + f = g if memory_map and hasattr(f, "fileno"): try: - g = MMapWrapper(f) + wrapped = MMapWrapper(f) f.close() - f = g + f = wrapped except Exception: # we catch any errors that may have occurred # because that is consistent with the lower-level @@ -452,15 +519,19 @@ def __init__( self, file: FilePathOrBuffer, mode: str, - compression: int = zipfile.ZIP_DEFLATED, + archive_name: Optional[str] = None, **kwargs ): if mode in ["wb", "rb"]: mode = mode.replace("b", "") - super().__init__(file, mode, compression, **kwargs) + self.archive_name = archive_name + super().__init__(file, mode, zipfile.ZIP_DEFLATED, **kwargs) def write(self, data): - super().writestr(self.filename, data) + archive_name = self.filename + if self.archive_name is not None: + archive_name = self.archive_name + super().writestr(archive_name, data) @property def closed(self): @@ -505,7 +576,6 @@ def __next__(self) -> str: class UTF8Recoder(BaseIterator): - """ Iterator that reads an encoded stream and re-encodes the input to UTF-8 """ diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 154656fbb250b..997edf49d9e8f 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -837,10 +837,10 @@ def parse( **kwds ): """ - Parse specified sheet(s) into a DataFrame + Parse specified sheet(s) into a DataFrame. Equivalent to read_excel(ExcelFile, ...) See the read_excel - docstring for more info on accepted parameters + docstring for more info on accepted parameters. 
Returns ------- diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 6fe22f14c2c5b..25a6db675265d 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -39,7 +39,7 @@ def to_feather(df, path): if not isinstance(df.index, Int64Index): raise ValueError( "feather does not support serializing {} " - "for the index; you can .reset_index()" + "for the index; you can .reset_index() " "to make the index into column(s)".format(type(df.index)) ) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 60daf311397e8..e25862537cbfc 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -22,6 +22,7 @@ from pandas.io.common import ( UnicodeWriter, + _get_compression_method, _get_handle, _infer_compression, get_filepath_or_buffer, @@ -58,6 +59,9 @@ def __init__( if path_or_buf is None: path_or_buf = StringIO() + # Extract compression mode as given, if dict + compression, self.compression_args = _get_compression_method(compression) + self.path_or_buf, _, _, _ = get_filepath_or_buffer( path_or_buf, encoding=encoding, compression=compression, mode=mode ) @@ -178,7 +182,7 @@ def save(self): self.path_or_buf, self.mode, encoding=self.encoding, - compression=self.compression, + compression=dict(self.compression_args, method=self.compression), ) close = True @@ -206,11 +210,13 @@ def save(self): if hasattr(self.path_or_buf, "write"): self.path_or_buf.write(buf) else: + compression = dict(self.compression_args, method=self.compression) + f, handles = _get_handle( self.path_or_buf, self.mode, encoding=self.encoding, - compression=self.compression, + compression=compression, ) f.write(buf) close = True diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index d8a370d77ea31..8ff4b9bda0430 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -336,9 +336,11 @@ def _get_formatted_index(self) -> Tuple[List[str], bool]: return fmt_index, have_header def _get_formatted_values(self) -> List[str]: - values_to_format = self.tr_series._formatting_values() return format_array( - values_to_format, None, float_format=self.float_format, na_rep=self.na_rep + self.tr_series._values, + None, + float_format=self.float_format, + na_rep=self.na_rep, ) def to_string(self) -> str: @@ -547,7 +549,8 @@ def __init__( decimal: str = ".", table_id: Optional[str] = None, render_links: bool = False, - **kwds + bold_rows: bool = False, + escape: bool = True, ): self.frame = frame self.show_index_names = index_names @@ -578,7 +581,8 @@ def __init__( else: self.justify = justify - self.kwds = kwds + self.bold_rows = bold_rows + self.escape = escape if columns is not None: self.columns = ensure_index(columns) @@ -903,9 +907,8 @@ def to_latex( def _format_col(self, i: int) -> List[str]: frame = self.tr_frame formatter = self._get_formatter(i) - values_to_format = frame.iloc[:, i]._formatting_values() return format_array( - values_to_format, + frame.iloc[:, i]._values, formatter, float_format=self.float_format, na_rep=self.na_rep, diff --git a/pandas/io/formats/html.py b/pandas/io/formats/html.py index 4b44893df70ed..8c4a7f4a1213d 100644 --- a/pandas/io/formats/html.py +++ b/pandas/io/formats/html.py @@ -37,7 +37,7 @@ class HTMLFormatter(TableFormatter): def __init__( self, formatter: DataFrameFormatter, - classes: Optional[Union[str, List, Tuple]] = None, + classes: Optional[Union[str, List[str], Tuple[str, ...]]] = None, border: Optional[int] = None, ) -> None: self.fmt = formatter @@ -46,11 +46,11 @@ def __init__( 
self.frame = self.fmt.frame self.columns = self.fmt.tr_frame.columns self.elements = [] # type: List[str] - self.bold_rows = self.fmt.kwds.get("bold_rows", False) - self.escape = self.fmt.kwds.get("escape", True) + self.bold_rows = self.fmt.bold_rows + self.escape = self.fmt.escape self.show_dimensions = self.fmt.show_dimensions if border is None: - border = get_option("display.html.border") + border = cast(int, get_option("display.html.border")) self.border = border self.table_id = self.fmt.table_id self.render_links = self.fmt.render_links diff --git a/pandas/io/formats/latex.py b/pandas/io/formats/latex.py index c60e15b733f0a..4c4d5ec73269a 100644 --- a/pandas/io/formats/latex.py +++ b/pandas/io/formats/latex.py @@ -39,12 +39,13 @@ def __init__( ): self.fmt = formatter self.frame = self.fmt.frame - self.bold_rows = self.fmt.kwds.get("bold_rows", False) + self.bold_rows = self.fmt.bold_rows self.column_format = column_format self.longtable = longtable self.multicolumn = multicolumn self.multicolumn_format = multicolumn_format self.multirow = multirow + self.escape = self.fmt.escape def write_result(self, buf: IO[str]) -> None: """ @@ -142,7 +143,7 @@ def pad_empties(x): buf.write("\\endfoot\n\n") buf.write("\\bottomrule\n") buf.write("\\endlastfoot\n") - if self.fmt.kwds.get("escape", True): + if self.escape: # escape backslashes first crow = [ ( diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index 4ec9094ce4abe..ead51693da791 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -3,12 +3,14 @@ """ import sys -from typing import Callable, Dict, Iterable, List, Optional, Sequence, Tuple, Union +from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple, Union from pandas._config import get_option from pandas.core.dtypes.inference import is_sequence +EscapeChars = Union[Dict[str, str], Iterable[str]] + def adjoin(space: int, *lists: List[str], **kwargs) -> str: """ @@ -148,19 +150,16 @@ def _pprint_dict( def pprint_thing( - thing, + thing: Any, _nest_lvl: int = 0, - escape_chars: Optional[Union[Dict[str, str], Iterable[str]]] = None, + escape_chars: Optional[EscapeChars] = None, default_escapes: bool = False, quote_strings: bool = False, max_seq_items: Optional[int] = None, ) -> str: """ This function is the sanctioned way of converting objects - to a unicode representation. - - properly handles nested sequences containing unicode strings - (unicode(object) does not) + to a string representation and properly handles nested sequences. Parameters ---------- @@ -178,21 +177,13 @@ def pprint_thing( Returns ------- - result - unicode str + str """ - def as_escaped_unicode(thing, escape_chars=escape_chars): - # Unicode is fine, else we try to decode using utf-8 and 'replace' - # if that's not it either, we have no way of knowing and the user - # should deal with it himself. 
- - try: - result = str(thing) # we should try this first - except UnicodeDecodeError: - # either utf-8 or we replace errors - result = str(thing).decode("utf-8", "replace") - + def as_escaped_string( + thing: Any, escape_chars: Optional[EscapeChars] = escape_chars + ) -> str: translate = {"\t": r"\t", "\n": r"\n", "\r": r"\r"} if isinstance(escape_chars, dict): if default_escapes: @@ -202,10 +193,11 @@ def as_escaped_unicode(thing, escape_chars=escape_chars): escape_chars = list(escape_chars.keys()) else: escape_chars = escape_chars or tuple() + + result = str(thing) for c in escape_chars: result = result.replace(c, translate[c]) - - return str(result) + return result if hasattr(thing, "__next__"): return str(thing) @@ -224,11 +216,11 @@ def as_escaped_unicode(thing, escape_chars=escape_chars): max_seq_items=max_seq_items, ) elif isinstance(thing, str) and quote_strings: - result = "'{thing}'".format(thing=as_escaped_unicode(thing)) + result = "'{thing}'".format(thing=as_escaped_string(thing)) else: - result = as_escaped_unicode(thing) + result = as_escaped_string(thing) - return str(result) # always unicode + return result def pprint_thing_encoded( diff --git a/pandas/io/html.py b/pandas/io/html.py index 9d2647f226f00..490c574463b9b 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -1,4 +1,5 @@ -""":mod:`pandas.io.html` is a module containing functionality for dealing with +""" +:mod:`pandas.io.html` is a module containing functionality for dealing with HTML IO. """ @@ -58,7 +59,8 @@ def _importers(): def _remove_whitespace(s, regex=_RE_WHITESPACE): - """Replace extra whitespace inside of a string with a single space. + """ + Replace extra whitespace inside of a string with a single space. Parameters ---------- @@ -77,7 +79,8 @@ def _remove_whitespace(s, regex=_RE_WHITESPACE): def _get_skiprows(skiprows): - """Get an iterator given an integer, slice or container. + """ + Get an iterator given an integer, slice or container. Parameters ---------- @@ -107,7 +110,8 @@ def _get_skiprows(skiprows): def _read(obj): - """Try to read from a url, file or string. + """ + Try to read from a url, file or string. Parameters ---------- @@ -136,7 +140,8 @@ def _read(obj): class _HtmlFrameParser: - """Base class for parsers that parse HTML into DataFrames. + """ + Base class for parsers that parse HTML into DataFrames. Parameters ---------- @@ -515,7 +520,8 @@ def _handle_hidden_tables(self, tbl_list, attr_name): class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser): - """HTML to DataFrame parser that uses BeautifulSoup under the hood. + """ + HTML to DataFrame parser that uses BeautifulSoup under the hood. See Also -------- @@ -622,7 +628,8 @@ def _build_xpath_expr(attrs): class _LxmlFrameParser(_HtmlFrameParser): - """HTML to DataFrame parser that uses lxml under the hood. + """ + HTML to DataFrame parser that uses lxml under the hood. Warning ------- @@ -937,7 +944,8 @@ def read_html( keep_default_na=True, displayed_only=True, ): - r"""Read HTML tables into a ``list`` of ``DataFrame`` objects. + r""" + Read HTML tables into a ``list`` of ``DataFrame`` objects. 
Parameters ---------- diff --git a/pandas/io/msgpack/__init__.py b/pandas/io/msgpack/__init__.py index 9b09cffd83f75..7107263c180cb 100644 --- a/pandas/io/msgpack/__init__.py +++ b/pandas/io/msgpack/__init__.py @@ -2,8 +2,8 @@ from collections import namedtuple -from pandas.io.msgpack.exceptions import * # noqa -from pandas.io.msgpack._version import version # noqa +from pandas.io.msgpack.exceptions import * # noqa: F401,F403 isort:skip +from pandas.io.msgpack._version import version # noqa: F401 isort:skip class ExtType(namedtuple("ExtType", "code data")): @@ -19,10 +19,14 @@ def __new__(cls, code, data): return super().__new__(cls, code, data) -import os # noqa +import os # noqa: F401,E402 isort:skip -from pandas.io.msgpack._packer import Packer # noqa -from pandas.io.msgpack._unpacker import unpack, unpackb, Unpacker # noqa +from pandas.io.msgpack._unpacker import ( # noqa: F401,E402 isort:skip + Unpacker, + unpack, + unpackb, +) +from pandas.io.msgpack._packer import Packer # noqa: E402 isort:skip def pack(o, stream, **kwargs): diff --git a/pandas/io/msgpack/_packer.pyi b/pandas/io/msgpack/_packer.pyi new file mode 100644 index 0000000000000..e95a1622c5615 --- /dev/null +++ b/pandas/io/msgpack/_packer.pyi @@ -0,0 +1,22 @@ +# flake8: noqa + +class Packer: + def __cinit__(self): ... + def __init__( + self, + default=..., + encoding=..., + unicode_errors=..., + use_single_float=..., + autoreset: int = ..., + use_bin_type: int = ..., + ): ... + def __dealloc__(self): ... + def _pack(self, o, nest_limit: int = ...) -> int: ... + def pack(self, obj): ... + def pack_ext_type(self, typecode, data): ... + def pack_array_header(self, size): ... + def pack_map_header(self, size): ... + def pack_map_pairs(self, pairs): ... + def reset(self) -> None: ... + def bytes(self): ... diff --git a/pandas/io/msgpack/_unpacker.pyi b/pandas/io/msgpack/_unpacker.pyi new file mode 100644 index 0000000000000..9910895947fb6 --- /dev/null +++ b/pandas/io/msgpack/_unpacker.pyi @@ -0,0 +1,59 @@ +# flake8: noqa + +def unpackb( + packed, + object_hook=..., + list_hook=..., + use_list=..., + encoding=..., + unicode_errors=..., + object_pairs_hook=..., + ext_hook=..., + max_str_len=..., + max_bin_len=..., + max_array_len=..., + max_map_len=..., + max_ext_len=..., +): ... +def unpack( + stream, + object_hook=..., + list_hook=..., + use_list=..., + encoding=..., + unicode_errors=..., + object_pairs_hook=..., +): ... + +class Unpacker: + def __cinit__(self): ... + def __dealloc__(self): ... + def __init__( + self, + file_like=..., + read_size=..., + use_list=..., + object_hook=..., + object_pairs_hook=..., + list_hook=..., + encoding=..., + unicode_errors=..., + max_buffer_size: int = ..., + ext_hook=..., + max_str_len=..., + max_bin_len=..., + max_array_len=..., + max_map_len=..., + max_ext_len=..., + ): ... + def feed(self, next_bytes): ... + def append_buffer(self, _buf, _buf_len): ... + def read_from_file(self): ... + def _unpack(self, execute, write_bytes, iter=...): ... + def read_bytes(self, nbytes): ... + def unpack(self, write_bytes=...): ... + def skip(self, write_bytes=...): ... + def read_array_header(self, write_bytes=...): ... + def read_map_header(self, write_bytes=...): ... + def __iter__(self): ... + def __next__(self): ... 
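The new stub files above only describe signatures. For orientation, a tiny round trip through the interfaces they annotate might look like the following; pandas.io.msgpack is an internal, vendored module rather than public API, and the encoding keyword is taken from the unpack stub above.

    import io

    from pandas.io.msgpack import pack, unpack

    buf = io.BytesIO()
    pack({"key": [1, 2, 3]}, buf)          # Packer().pack(...) written to the stream
    buf.seek(0)
    print(unpack(buf, encoding="utf-8"))   # -> {'key': [1, 2, 3]}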
diff --git a/pandas/io/packers.py b/pandas/io/packers.py index 04e49708ff082..ad47ba23b9221 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -846,7 +846,6 @@ def __init__( class Iterator: - """ manage the unpacking iteration, close the file on completion """ diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 82c460300582b..6fc70e9f4a737 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -184,12 +184,14 @@ def write( def read(self, path, columns=None, **kwargs): if is_s3_url(path): + from pandas.io.s3 import get_file_and_filesystem + # When path is s3:// an S3File is returned. # We need to retain the original path(str) while also # pass the S3File().open function to fsatparquet impl. - s3, _, _, should_close = get_filepath_or_buffer(path) + s3, filesystem = get_file_and_filesystem(path) try: - parquet_file = self.api.ParquetFile(path, open_with=s3.s3.open) + parquet_file = self.api.ParquetFile(path, open_with=filesystem.open) finally: s3.close() else: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index f4b00b0aac5f7..a3ff837bc7f52 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1393,6 +1393,10 @@ def __init__(self, kwds): if isinstance(self.header, (list, tuple, np.ndarray)): if not all(map(is_integer, self.header)): raise ValueError("header must be integer or list of integers") + if any(i < 0 for i in self.header): + raise ValueError( + "cannot specify multi-index header with negative integers" + ) if kwds.get("usecols"): raise ValueError( "cannot specify usecols when specifying a multi-index header" @@ -1419,6 +1423,13 @@ def __init__(self, kwds): elif self.header is not None and not is_integer(self.header): raise ValueError("header must be integer or list of integers") + # GH 27779 + elif self.header is not None and self.header < 0: + raise ValueError( + "Passing negative integer to header is invalid. " + "For no header, use header=None instead" + ) + self._name_processed = False self._first_chunk = True diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index abc8a414eb37a..1ff3400323e54 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -429,10 +429,10 @@ def _is_metadata_of(group, parent_group): class HDFStore: - """ - Dict-like IO interface for storing pandas objects in PyTables - either Fixed or Table format. + Dict-like IO interface for storing pandas objects in PyTables. + + Either Fixed or Table format. Parameters ---------- @@ -564,13 +564,12 @@ def __exit__(self, exc_type, exc_value, traceback): def keys(self): """ - Return a (potentially unordered) list of the keys corresponding to the - objects stored in the HDFStore. These are ABSOLUTE path-names (e.g. - have the leading '/' + Return a list of keys corresponding to objects stored in HDFStore. Returns ------- list + List of ABSOLUTE path-names (e.g. have the leading '/'). """ return [n._v_pathname for n in self.groups()] @@ -703,7 +702,7 @@ def flush(self, fsync=False): def get(self, key): """ - Retrieve pandas object stored in file + Retrieve pandas object stored in file. Parameters ---------- @@ -711,7 +710,8 @@ def get(self, key): Returns ------- - obj : same type as object stored in file + object + Same type as object stored in file. """ group = self.get_node(key) if group is None: @@ -731,25 +731,31 @@ def select( **kwargs ): """ - Retrieve pandas object stored in file, optionally based on where - criteria + Retrieve pandas object stored in file, optionally based on where criteria. 
Parameters ---------- key : object - where : list of Term (or convertible) objects, optional - start : integer (defaults to None), row number to start selection - stop : integer (defaults to None), row number to stop selection - columns : a list of columns that if not None, will limit the return - columns - iterator : boolean, return an iterator, default False - chunksize : nrows to include in iteration, return an iterator - auto_close : boolean, should automatically close the store when - finished, default is False + Object being retrieved from file. + where : list, default None + List of Term (or convertible) objects, optional. + start : int, default None + Row number to start selection. + stop : int, default None + Row number to stop selection. + columns : list, default None + A list of columns that if not None, will limit the return columns. + iterator : bool, default False + Returns an iterator. + chunksize : int, default None + Number or rows to include in iteration, return an iterator. + auto_close : bool, default False + Should automatically close the store when finished. Returns ------- - The selected object + object + Retrieved object from file. """ group = self.get_node(key) if group is None: @@ -929,28 +935,30 @@ def func(_start, _stop, _where): def put(self, key, value, format=None, append=False, **kwargs): """ - Store object in HDFStore + Store object in HDFStore. Parameters ---------- - key : object - value : {Series, DataFrame} - format : 'fixed(f)|table(t)', default is 'fixed' + key : object + value : {Series, DataFrame} + format : 'fixed(f)|table(t)', default is 'fixed' fixed(f) : Fixed format - Fast writing/reading. Not-appendable, nor searchable + Fast writing/reading. Not-appendable, nor searchable. table(t) : Table format Write as a PyTables Table structure which may perform worse but allow more flexible operations like searching - / selecting subsets of the data - append : boolean, default False + / selecting subsets of the data. + append : bool, default False This will force Table format, append the input data to the existing. - data_columns : list of columns to create as data columns, or True to + data_columns : list, default None + List of columns to create as data columns, or True to use all columns. See `here `__. - encoding : default None, provide an encoding for strings - dropna : boolean, default False, do not write an ALL nan row to - the store settable by the option 'io.hdf.dropna_table' + encoding : str, default None + Provide an encoding for strings. + dropna : bool, default False, do not write an ALL nan row to + The store settable by the option 'io.hdf.dropna_table'. """ if format is None: format = get_option("io.hdf.default_format") or "fixed" @@ -1165,12 +1173,15 @@ def create_table_index(self, key, **kwargs): s.create_index(**kwargs) def groups(self): - """return a list of all the top-level nodes (that are not themselves a - pandas storage object) + """ + Return a list of all the top-level nodes. + + Each node returned is not a pandas storage object. Returns ------- list + List of objects. """ _tables() self._check_if_open() @@ -1188,10 +1199,12 @@ def groups(self): ] def walk(self, where="/"): - """ Walk the pytables group hierarchy for pandas objects + """ + Walk the pytables group hierarchy for pandas objects. This generator will yield the group path, subgroups and pandas object names for each group. + Any non-pandas PyTables objects that are not a group will be ignored. 
The `where` group itself is listed first (preorder), then each of its @@ -1202,18 +1215,17 @@ def walk(self, where="/"): Parameters ---------- - where : str, optional + where : str, default "/" Group where to start walking. - If not supplied, the root group is used. Yields ------ path : str - Full path to a group (without trailing '/') - groups : list of str - names of the groups contained in `path` - leaves : list of str - names of the pandas objects contained in `path` + Full path to a group (without trailing '/'). + groups : list + Names (strings) of the groups contained in `path`. + leaves : list + Names (strings) of the pandas objects contained in `path`. """ _tables() self._check_if_open() @@ -1533,7 +1545,6 @@ def _read_group(self, group, **kwargs): class TableIterator: - """ define the iteration interface on a table Parameters @@ -1641,7 +1652,6 @@ def get_result(self, coordinates=False): class IndexCol: - """ an index column description class Parameters @@ -1955,7 +1965,6 @@ def write_metadata(self, handler): class GenericIndexCol(IndexCol): - """ an index which is not represented in the data of the table """ @property @@ -1993,7 +2002,6 @@ def set_attr(self): class DataCol(IndexCol): - """ a data holding column, by definition this is not indexable Parameters @@ -2443,7 +2451,6 @@ def set_attr(self): class DataIndexableCol(DataCol): - """ represent a data column that can be indexed """ is_data_indexable = True @@ -2466,7 +2473,6 @@ def get_atom_timedelta64(self, block): class GenericDataIndexableCol(DataIndexableCol): - """ represent a generic pytables data column """ def get_attr(self): @@ -2474,7 +2480,6 @@ def get_attr(self): class Fixed: - """ represent an object in my store facilitate read/write of various types of objects this is an abstract base class @@ -2642,7 +2647,6 @@ def delete(self, where=None, start=None, stop=None, **kwargs): class GenericFixed(Fixed): - """ a generified fixed version """ _index_type_map = {DatetimeIndex: "datetime", PeriodIndex: "period"} @@ -2898,7 +2902,12 @@ def read_index_node(self, node, start=None, stop=None): kwargs["freq"] = node._v_attrs["freq"] if "tz" in node._v_attrs: - kwargs["tz"] = node._v_attrs["tz"] + if isinstance(node._v_attrs["tz"], bytes): + # created by python2 + kwargs["tz"] = node._v_attrs["tz"].decode("utf-8") + else: + # created by python3 + kwargs["tz"] = node._v_attrs["tz"] if kind in ("date", "datetime"): index = factory( @@ -3202,7 +3211,9 @@ def read(self, start=None, stop=None, **kwargs): values = self.read_array( "block{idx}_values".format(idx=i), start=_start, stop=_stop ) - blk = make_block(values, placement=items.get_indexer(blk_items)) + blk = make_block( + values, placement=items.get_indexer(blk_items), ndim=len(axes) + ) blocks.append(blk) return self.obj_type(BlockManager(blocks, axes)) @@ -3237,7 +3248,6 @@ class FrameFixed(BlockManagerFixed): class Table(Fixed): - """ represent a table: facilitate read/write of various types of tables @@ -4112,7 +4122,6 @@ def read_column(self, column, where=None, start=None, stop=None): class WORMTable(Table): - """ a write-once read-many table: this format DOES NOT ALLOW appending to a table. 
writing is a one-time operation the data are stored in a format that allows for searching the data on disk @@ -4134,7 +4143,6 @@ def write(self, **kwargs): class LegacyTable(Table): - """ an appendable table: allow append/query/delete operations to a (possibly) already existing appendable table this table ALLOWS append (but doesn't require them), and stores the data in a format @@ -4462,7 +4470,7 @@ def read(self, where=None, columns=None, **kwargs): if values.ndim == 1 and isinstance(values, np.ndarray): values = values.reshape((1, values.shape[0])) - block = make_block(values, placement=np.arange(len(cols_))) + block = make_block(values, placement=np.arange(len(cols_)), ndim=2) mgr = BlockManager([block], [cols_, index_]) frames.append(DataFrame(mgr)) @@ -4588,7 +4596,6 @@ def write(self, **kwargs): class AppendableMultiFrameTable(AppendableFrameTable): - """ a frame with a multi-index """ table_type = "appendable_multiframe" @@ -4947,7 +4954,6 @@ def _need_convert(kind): class Selection: - """ Carries out a selection operation on a tables.Table object. diff --git a/pandas/io/s3.py b/pandas/io/s3.py index 0a7c082fec51c..7e0a37e8cba20 100644 --- a/pandas/io/s3.py +++ b/pandas/io/s3.py @@ -1,8 +1,11 @@ """ s3 support for remote file interactivity """ +from typing import IO, Any, Optional, Tuple from urllib.parse import urlparse as parse_url from pandas.compat._optional import import_optional_dependency +from pandas._typing import FilePathOrBuffer + s3fs = import_optional_dependency( "s3fs", extra="The s3fs package is required to handle s3 files." ) @@ -14,9 +17,9 @@ def _strip_schema(url): return result.netloc + result.path -def get_filepath_or_buffer( - filepath_or_buffer, encoding=None, compression=None, mode=None -): +def get_file_and_filesystem( + filepath_or_buffer: FilePathOrBuffer, mode: Optional[str] = None +) -> Tuple[IO, Any]: from botocore.exceptions import NoCredentialsError if mode is None: @@ -24,7 +27,7 @@ def get_filepath_or_buffer( fs = s3fs.S3FileSystem(anon=False) try: - filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer), mode) + file = fs.open(_strip_schema(filepath_or_buffer), mode) except (FileNotFoundError, NoCredentialsError): # boto3 has troubles when trying to access a public file # when credentialed... @@ -33,5 +36,15 @@ def get_filepath_or_buffer( # A NoCredentialsError is raised if you don't have creds # for that bucket. fs = s3fs.S3FileSystem(anon=True) - filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer), mode) - return filepath_or_buffer, None, compression, True + file = fs.open(_strip_schema(filepath_or_buffer), mode) + return file, fs + + +def get_filepath_or_buffer( + filepath_or_buffer: FilePathOrBuffer, + encoding: Optional[str] = None, + compression: Optional[str] = None, + mode: Optional[str] = None, +) -> Tuple[IO, Optional[str], Optional[str], bool]: + file, _fs = get_file_and_filesystem(filepath_or_buffer, mode=mode) + return file, None, compression, True diff --git a/pandas/io/sql.py b/pandas/io/sql.py index f1f52a9198d29..44cb399336d62 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -269,7 +269,8 @@ def read_sql_query( parse_dates=None, chunksize=None, ): - """Read SQL query into a DataFrame. + """ + Read SQL query into a DataFrame. Returns a DataFrame corresponding to the result set of the query string. Optionally provide an `index_col` parameter to use one of the @@ -455,14 +456,14 @@ def to_sql( Parameters ---------- frame : DataFrame, Series - name : string + name : str Name of SQL table. 
con : SQLAlchemy connectable(engine/connection) or database string URI or sqlite3 DBAPI2 connection Using SQLAlchemy makes it possible to use any DB supported by that library. If a DBAPI2 object, only sqlite3 is supported. - schema : string, default None + schema : str, optional Name of SQL schema in database to write to (if database flavor supports this). If None, use default schema (default). if_exists : {'fail', 'replace', 'append'}, default 'fail' @@ -471,18 +472,19 @@ def to_sql( - append: If table exists, insert data. Create if does not exist. index : boolean, default True Write DataFrame index as a column. - index_label : string or sequence, default None + index_label : str or sequence, optional Column label for index column(s). If None is given (default) and `index` is True, then the index names are used. A sequence should be given if the DataFrame uses MultiIndex. - chunksize : int, default None - If not None, then rows will be written in batches of this size at a - time. If None, all rows will be written at once. - dtype : single SQLtype or dict of column name to SQL type, default None - Optional specifying the datatype for columns. The SQL type should - be a SQLAlchemy type, or a string for sqlite3 fallback connection. - If all columns are of the same type, one single value can be used. - method : {None, 'multi', callable}, default None + chunksize : int, optional + Specify the number of rows in each batch to be written at a time. + By default, all rows will be written at once. + dtype : dict or scalar, optional + Specifying the datatype for columns. If a dictionary is used, the + keys should be the column names and the values should be the + SQLAlchemy types or strings for the sqlite3 fallback mode. If a + scalar is provided, it will be applied to all columns. + method : {None, 'multi', callable}, optional Controls the SQL insertion clause used: - None : Uses standard SQL ``INSERT`` clause (one per row). diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 69bafc7749258..31fdaa5cc6735 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -138,7 +138,7 @@ _iterator_params, ) -_data_method_doc = """\ +_data_method_doc = """ Read observations from Stata file, converting them into a dataframe .. deprecated:: diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index a3c1499845c2a..2e6a401b49efc 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -53,7 +53,7 @@ def hist_series( rotation of y axis labels figsize : tuple, default None figure size in inches by default - bins : integer or sequence, default 10 + bins : int or sequence, default 10 Number of histogram bins to be used. If an integer is given, bins + 1 bin edges are calculated and returned. If bins is a sequence, gives bin edges, including left edge of first bin and right edge of last @@ -116,7 +116,7 @@ def hist_frame( ---------- data : DataFrame The pandas object holding the data. - column : string or sequence + column : str or sequence If passed, will be used to limit data to a subset of columns. by : object, optional If passed, then used to form histograms for separate groups. @@ -148,7 +148,7 @@ def hist_frame( `matplotlib.rcParams` by default. layout : tuple, optional Tuple of (rows, columns) for the layout of the histograms. - bins : integer or sequence, default 10 + bins : int or sequence, default 10 Number of histogram bins to be used. If an integer is given, bins + 1 bin edges are calculated and returned. 
If bins is a sequence, gives bin edges, including left edge of first bin and right edge of last @@ -177,7 +177,7 @@ def hist_frame( >>> df = pd.DataFrame({ ... 'length': [1.5, 0.5, 1.2, 0.9, 3], ... 'width': [0.7, 0.2, 0.15, 0.2, 1.1] - ... }, index= ['pig', 'rabbit', 'duck', 'chicken', 'horse']) + ... }, index=['pig', 'rabbit', 'duck', 'chicken', 'horse']) >>> hist = df.hist(bins=3) """ plot_backend = _get_plot_backend() @@ -370,8 +370,8 @@ def boxplot( If ``return_type`` is `None`, a NumPy array of axes with the same shape as ``layout`` is returned: - >>> boxplot = df.boxplot(column=['Col1', 'Col2'], by='X', - ... return_type=None) + >>> boxplot = df.boxplot(column=['Col1', 'Col2'], by='X', + ... return_type=None) >>> type(boxplot) """ @@ -446,7 +446,7 @@ def boxplot_frame_groupby( * ``True`` - create a subplot for each group column : column name or list of names, or vector Can be any valid input to groupby - fontsize : int or string + fontsize : int or str rot : label rotation angle grid : Setting this to True will show the grid ax : Matplotlib axis object, default None @@ -530,7 +530,7 @@ class PlotAccessor(PandasObject): figsize : a tuple (width, height) in inches use_index : bool, default True Use index as ticks for x axis - title : string or list + title : str or list Title to use for the plot. If a string is passed, print the string at the top of the figure. If a list is passed and `subplots` is True, print each item in the list above the corresponding subplot. @@ -553,16 +553,16 @@ class PlotAccessor(PandasObject): .. versionchanged:: 0.25.0 xticks : sequence - Values to use for the xticks + Values to use for the xticks. yticks : sequence - Values to use for the yticks + Values to use for the yticks. xlim : 2-tuple/list ylim : 2-tuple/list rot : int, default None Rotation for ticks (xticks for vertical, yticks for horizontal plots) fontsize : int, default None - Font size for xticks and yticks + Font size for xticks and yticks. colormap : str or matplotlib colormap object, default None Colormap to select colors from. If string, load colormap with that name from matplotlib. @@ -586,8 +586,10 @@ class PlotAccessor(PandasObject): mark_right : bool, default True When using a secondary_y axis, automatically mark the column labels with "(right)" in the legend + include_bool : bool, default is False + If True, boolean values can be plotted. `**kwds` : keywords - Options to pass to matplotlib plotting method + Options to pass to matplotlib plotting method. Returns ------- @@ -983,7 +985,7 @@ def barh(self, x=None, y=None, **kwargs): .. plot:: :context: close-figs - >>> df = pd.DataFrame({'lab':['A', 'B', 'C'], 'val':[10, 30, 20]}) + >>> df = pd.DataFrame({'lab': ['A', 'B', 'C'], 'val': [10, 30, 20]}) >>> ax = df.plot.barh(x='lab', y='val') Plot a whole DataFrame to a horizontal bar plot @@ -1047,7 +1049,7 @@ def box(self, by=None, **kwargs): Parameters ---------- - by : string or sequence + by : str or sequence Column in the DataFrame to group by. 
**kwds : optional Additional keywords are documented in diff --git a/pandas/plotting/_matplotlib/converter.py b/pandas/plotting/_matplotlib/converter.py index 15648d59c8f98..893854ab26e37 100644 --- a/pandas/plotting/_matplotlib/converter.py +++ b/pandas/plotting/_matplotlib/converter.py @@ -64,11 +64,12 @@ def register(explicit=True): pairs = get_pairs() for type_, cls in pairs: - converter = cls() - if type_ in units.registry: + # Cache previous converter if present + if type_ in units.registry and not isinstance(units.registry[type_], cls): previous = units.registry[type_] _mpl_units[type_] = previous - units.registry[type_] = converter + # Replace with pandas converter + units.registry[type_] = cls() def deregister(): diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index c2b37bb297ecb..6ff3f28440303 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -106,6 +106,7 @@ def __init__( colormap=None, table=False, layout=None, + include_bool=False, **kwds ): @@ -191,6 +192,7 @@ def __init__( self.colormap = colormap self.table = table + self.include_bool = include_bool self.kwds = kwds @@ -400,9 +402,20 @@ def _compute_plot_data(self): # GH16953, _convert is needed as fallback, for ``Series`` # with ``dtype == object`` data = data._convert(datetime=True, timedelta=True) - numeric_data = data.select_dtypes( - include=[np.number, "datetime", "datetimetz", "timedelta"] - ) + include_type = [np.number, "datetime", "datetimetz", "timedelta"] + + # GH23719, allow plotting boolean + if self.include_bool is True: + include_type.append(np.bool_) + + # GH22799, exclude datatime-like type for boxplot + exclude_type = None + if self._kind == "box": + # TODO: change after solving issue 27881 + include_type = [np.number] + exclude_type = ["timedelta"] + + numeric_data = data.select_dtypes(include=include_type, exclude=exclude_type) try: is_empty = numeric_data.empty @@ -549,7 +562,7 @@ def _add_legend_handle(self, handle, label, index=None): self.legend_labels.append(label) def _make_legend(self): - ax, leg = self._get_ax_legend(self.axes[0]) + ax, leg, handle = self._get_ax_legend_handle(self.axes[0]) handles = [] labels = [] @@ -558,7 +571,8 @@ def _make_legend(self): if not self.subplots: if leg is not None: title = leg.get_title().get_text() - handles = leg.legendHandles + # Replace leg.LegendHandles because it misses marker info + handles.extend(handle) labels = [x.get_text() for x in leg.get_texts()] if self.legend: @@ -568,6 +582,7 @@ def _make_legend(self): handles += self.legend_handles labels += self.legend_labels + if self.legend_title is not None: title = self.legend_title @@ -579,8 +594,14 @@ def _make_legend(self): if ax.get_visible(): ax.legend(loc="best") - def _get_ax_legend(self, ax): + def _get_ax_legend_handle(self, ax): + """ + Take in axes and return ax, legend and handle under different scenarios + """ leg = ax.get_legend() + + # Get handle from axes + handle, _ = ax.get_legend_handles_labels() other_ax = getattr(ax, "left_ax", None) or getattr(ax, "right_ax", None) other_leg = None if other_ax is not None: @@ -588,7 +609,7 @@ def _get_ax_legend(self, ax): if leg is None and other_leg is not None: leg = other_leg ax = other_ax - return ax, leg + return ax, leg, handle @cache_readonly def plt(self): @@ -1080,9 +1101,13 @@ def _make_plot(self): ) self._add_legend_handle(newlines[0], label, index=i) - lines = _get_all_lines(ax) - left, right = _get_xlim(lines) - ax.set_xlim(left, right) + if 
self._is_ts_plot(): + + # reset of xlim should be used for ts data + # TODO: GH28021, should find a way to change view limit on xaxis + lines = _get_all_lines(ax) + left, right = _get_xlim(lines) + ax.set_xlim(left, right) @classmethod def _plot(cls, ax, x, y, style=None, column_num=None, stacking_id=None, **kwds): diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py index 8472eb3a3d887..67fa79ad5da8c 100644 --- a/pandas/plotting/_matplotlib/tools.py +++ b/pandas/plotting/_matplotlib/tools.py @@ -343,6 +343,21 @@ def _flatten(axes): return np.array(axes) +def _set_ticks_props(axes, xlabelsize=None, xrot=None, ylabelsize=None, yrot=None): + import matplotlib.pyplot as plt + + for ax in _flatten(axes): + if xlabelsize is not None: + plt.setp(ax.get_xticklabels(), fontsize=xlabelsize) + if xrot is not None: + plt.setp(ax.get_xticklabels(), rotation=xrot) + if ylabelsize is not None: + plt.setp(ax.get_yticklabels(), fontsize=ylabelsize) + if yrot is not None: + plt.setp(ax.get_yticklabels(), rotation=yrot) + return axes + + def _get_all_lines(ax): lines = ax.get_lines() @@ -362,18 +377,3 @@ def _get_xlim(lines): left = min(np.nanmin(x), left) right = max(np.nanmax(x), right) return left, right - - -def _set_ticks_props(axes, xlabelsize=None, xrot=None, ylabelsize=None, yrot=None): - import matplotlib.pyplot as plt - - for ax in _flatten(axes): - if xlabelsize is not None: - plt.setp(ax.get_xticklabels(), fontsize=xlabelsize) - if xrot is not None: - plt.setp(ax.get_xticklabels(), rotation=xrot) - if ylabelsize is not None: - plt.setp(ax.get_yticklabels(), fontsize=ylabelsize) - if yrot is not None: - plt.setp(ax.get_yticklabels(), rotation=yrot) - return axes diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index 1cba0e7354182..7ed0ffc6d0115 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -329,7 +329,8 @@ def parallel_coordinates( sort_labels=False, **kwds ): - """Parallel coordinates plotting. + """ + Parallel coordinates plotting. Parameters ---------- @@ -392,7 +393,8 @@ def parallel_coordinates( def lag_plot(series, lag=1, ax=None, **kwds): - """Lag plot for time series. + """ + Lag plot for time series. Parameters ---------- diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index 3920cfcc002d7..5931cd93cc8c5 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -30,6 +30,54 @@ import pandas.util.testing as tm +def assert_invalid_comparison(left, right, box): + """ + Assert that comparison operations with mismatched types behave correctly. 
+ + Parameters + ---------- + left : np.ndarray, ExtensionArray, Index, or Series + right : object + box : {pd.DataFrame, pd.Series, pd.Index, tm.to_array} + """ + # Not for tznaive-tzaware comparison + + # Note: not quite the same as how we do this for tm.box_expected + xbox = box if box is not pd.Index else np.array + + result = left == right + expected = xbox(np.zeros(result.shape, dtype=np.bool_)) + + tm.assert_equal(result, expected) + + result = right == left + tm.assert_equal(result, expected) + + result = left != right + tm.assert_equal(result, ~expected) + + result = right != left + tm.assert_equal(result, ~expected) + + msg = "Invalid comparison between" + with pytest.raises(TypeError, match=msg): + left < right + with pytest.raises(TypeError, match=msg): + left <= right + with pytest.raises(TypeError, match=msg): + left > right + with pytest.raises(TypeError, match=msg): + left >= right + with pytest.raises(TypeError, match=msg): + right < left + with pytest.raises(TypeError, match=msg): + right <= left + with pytest.raises(TypeError, match=msg): + right > left + with pytest.raises(TypeError, match=msg): + right >= left + + def assert_all(obj): """ Test helper to call call obj.all() the appropriate number of times on @@ -47,7 +95,7 @@ def assert_all(obj): class TestDatetime64ArrayLikeComparisons: # Comparison tests for datetime64 vectors fully parametrized over - # DataFrame/Series/DatetimeIndex/DateteimeArray. Ideally all comparison + # DataFrame/Series/DatetimeIndex/DatetimeArray. Ideally all comparison # tests will eventually end up here. def test_compare_zerodim(self, tz_naive_fixture, box_with_array): @@ -59,36 +107,61 @@ def test_compare_zerodim(self, tz_naive_fixture, box_with_array): other = np.array(dti.to_numpy()[0]) - # FIXME: ValueError with transpose on tzaware - dtarr = tm.box_expected(dti, box, transpose=False) + dtarr = tm.box_expected(dti, box) result = dtarr <= other expected = np.array([True, False, False]) - expected = tm.box_expected(expected, xbox, transpose=False) + expected = tm.box_expected(expected, xbox) tm.assert_equal(result, expected) + def test_dt64arr_cmp_date_invalid(self, tz_naive_fixture, box_with_array): + # GH#19800, GH#19301 datetime.date comparison raises to + # match DatetimeIndex/Timestamp. 
This also matches the behavior + # of stdlib datetime.datetime + tz = tz_naive_fixture -class TestDatetime64DataFrameComparison: - @pytest.mark.parametrize( - "timestamps", - [ - [pd.Timestamp("2012-01-01 13:00:00+00:00")] * 2, - [pd.Timestamp("2012-01-01 13:00:00")] * 2, - ], - ) - def test_tz_aware_scalar_comparison(self, timestamps): - # GH#15966 - df = pd.DataFrame({"test": timestamps}) - expected = pd.DataFrame({"test": [False, False]}) - tm.assert_frame_equal(df == -1, expected) + dti = pd.date_range("20010101", periods=10, tz=tz) + date = dti[0].to_pydatetime().date() + + dtarr = tm.box_expected(dti, box_with_array) + assert_invalid_comparison(dtarr, date, box_with_array) + + @pytest.mark.parametrize("other", ["foo", -1, 99, 4.0, object(), timedelta(days=2)]) + def test_dt64arr_cmp_scalar_invalid(self, other, tz_naive_fixture, box_with_array): + # GH#22074, GH#15966 + tz = tz_naive_fixture + + rng = date_range("1/1/2000", periods=10, tz=tz) + dtarr = tm.box_expected(rng, box_with_array) + assert_invalid_comparison(dtarr, other, box_with_array) + + @pytest.mark.parametrize("other", [None, np.nan]) + def test_dt64arr_cmp_na_scalar_invalid( + self, other, tz_naive_fixture, box_with_array + ): + # GH#19301 + tz = tz_naive_fixture + dti = pd.date_range("2016-01-01", periods=2, tz=tz) + dtarr = tm.box_expected(dti, box_with_array) + assert_invalid_comparison(dtarr, other, box_with_array) - def test_dt64_nat_comparison(self): + def test_dt64arr_nat_comparison(self, tz_naive_fixture, box_with_array): # GH#22242, GH#22163 DataFrame considered NaT == ts incorrectly - ts = pd.Timestamp.now() - df = pd.DataFrame([ts, pd.NaT]) - expected = pd.DataFrame([True, False]) + tz = tz_naive_fixture + box = box_with_array + xbox = box if box is not pd.Index else np.ndarray - result = df == ts - tm.assert_frame_equal(result, expected) + ts = pd.Timestamp.now(tz) + ser = pd.Series([ts, pd.NaT]) + + # FIXME: Can't transpose because that loses the tz dtype on + # the NaT column + obj = tm.box_expected(ser, box, transpose=False) + + expected = pd.Series([True, False], dtype=np.bool_) + expected = tm.box_expected(expected, xbox, transpose=False) + + result = obj == ts + tm.assert_equal(result, expected) class TestDatetime64SeriesComparison: @@ -142,35 +215,17 @@ def test_nat_comparisons(self, dtype, box, reverse, pair): expected = Series([False, False, True]) tm.assert_series_equal(left <= right, expected) - def test_comparison_invalid(self, box_with_array): + def test_comparison_invalid(self, tz_naive_fixture, box_with_array): # GH#4968 # invalid date/int comparisons - xbox = box_with_array if box_with_array is not pd.Index else np.ndarray - + tz = tz_naive_fixture ser = Series(range(5)) - ser2 = Series(pd.date_range("20010101", periods=5)) + ser2 = Series(pd.date_range("20010101", periods=5, tz=tz)) ser = tm.box_expected(ser, box_with_array) ser2 = tm.box_expected(ser2, box_with_array) - for (x, y) in [(ser, ser2), (ser2, ser)]: - - result = x == y - expected = tm.box_expected([False] * 5, xbox) - tm.assert_equal(result, expected) - - result = x != y - expected = tm.box_expected([True] * 5, xbox) - tm.assert_equal(result, expected) - msg = "Invalid comparison between" - with pytest.raises(TypeError, match=msg): - x >= y - with pytest.raises(TypeError, match=msg): - x > y - with pytest.raises(TypeError, match=msg): - x < y - with pytest.raises(TypeError, match=msg): - x <= y + assert_invalid_comparison(ser, ser2, box_with_array) @pytest.mark.parametrize( "data", @@ -227,26 +282,6 @@ def 
test_series_comparison_scalars(self): expected = Series([x > val for x in series]) tm.assert_series_equal(result, expected) - def test_dt64ser_cmp_date_invalid(self, box_with_array): - # GH#19800 datetime.date comparison raises to - # match DatetimeIndex/Timestamp. This also matches the behavior - # of stdlib datetime.datetime - - ser = pd.date_range("20010101", periods=10) - date = ser[0].to_pydatetime().date() - - ser = tm.box_expected(ser, box_with_array) - assert_all(~(ser == date)) - assert_all(ser != date) - with pytest.raises(TypeError): - ser > date - with pytest.raises(TypeError): - ser < date - with pytest.raises(TypeError): - ser >= date - with pytest.raises(TypeError): - ser <= date - @pytest.mark.parametrize( "left,right", [("lt", "gt"), ("le", "ge"), ("eq", "eq"), ("ne", "ne")] ) @@ -388,57 +423,6 @@ def test_dti_cmp_datetimelike(self, other, tz_naive_fixture): expected = np.array([True, False]) tm.assert_numpy_array_equal(result, expected) - def dt64arr_cmp_non_datetime(self, tz_naive_fixture, box_with_array): - # GH#19301 by convention datetime.date is not considered comparable - # to Timestamp or DatetimeIndex. This may change in the future. - tz = tz_naive_fixture - dti = pd.date_range("2016-01-01", periods=2, tz=tz) - dtarr = tm.box_expected(dti, box_with_array) - - other = datetime(2016, 1, 1).date() - assert not (dtarr == other).any() - assert (dtarr != other).all() - with pytest.raises(TypeError): - dtarr < other - with pytest.raises(TypeError): - dtarr <= other - with pytest.raises(TypeError): - dtarr > other - with pytest.raises(TypeError): - dtarr >= other - - @pytest.mark.parametrize("other", [None, np.nan, pd.NaT]) - def test_dti_eq_null_scalar(self, other, tz_naive_fixture): - # GH#19301 - tz = tz_naive_fixture - dti = pd.date_range("2016-01-01", periods=2, tz=tz) - assert not (dti == other).any() - - @pytest.mark.parametrize("other", [None, np.nan, pd.NaT]) - def test_dti_ne_null_scalar(self, other, tz_naive_fixture): - # GH#19301 - tz = tz_naive_fixture - dti = pd.date_range("2016-01-01", periods=2, tz=tz) - assert (dti != other).all() - - @pytest.mark.parametrize("other", [None, np.nan]) - def test_dti_cmp_null_scalar_inequality( - self, tz_naive_fixture, other, box_with_array - ): - # GH#19301 - tz = tz_naive_fixture - dti = pd.date_range("2016-01-01", periods=2, tz=tz) - dtarr = tm.box_expected(dti, box_with_array) - msg = "Invalid comparison between" - with pytest.raises(TypeError, match=msg): - dtarr < other - with pytest.raises(TypeError, match=msg): - dtarr <= other - with pytest.raises(TypeError, match=msg): - dtarr > other - with pytest.raises(TypeError, match=msg): - dtarr >= other - @pytest.mark.parametrize("dtype", [None, object]) def test_dti_cmp_nat(self, dtype, box_with_array): if box_with_array is tm.to_array and dtype is object: @@ -728,34 +712,6 @@ def test_dti_cmp_str(self, tz_naive_fixture): expected = np.array([True] * 10) tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize("other", ["foo", 99, 4.0, object(), timedelta(days=2)]) - def test_dt64arr_cmp_scalar_invalid(self, other, tz_naive_fixture, box_with_array): - # GH#22074 - tz = tz_naive_fixture - xbox = box_with_array if box_with_array is not pd.Index else np.ndarray - - rng = date_range("1/1/2000", periods=10, tz=tz) - rng = tm.box_expected(rng, box_with_array) - - result = rng == other - expected = np.array([False] * 10) - expected = tm.box_expected(expected, xbox) - tm.assert_equal(result, expected) - - result = rng != other - expected = np.array([True] * 10) - 
expected = tm.box_expected(expected, xbox) - tm.assert_equal(result, expected) - msg = "Invalid comparison between" - with pytest.raises(TypeError, match=msg): - rng < other - with pytest.raises(TypeError, match=msg): - rng <= other - with pytest.raises(TypeError, match=msg): - rng > other - with pytest.raises(TypeError, match=msg): - rng >= other - def test_dti_cmp_list(self): rng = date_range("1/1/2000", periods=10) @@ -1097,7 +1053,13 @@ def test_dt64arr_add_timestamp_raises(self, box_with_array): def test_dt64arr_add_sub_float(self, other, box_with_array): dti = DatetimeIndex(["2011-01-01", "2011-01-02"], freq="D") dtarr = tm.box_expected(dti, box_with_array) - msg = "|".join(["unsupported operand type", "cannot (add|subtract)"]) + msg = "|".join( + [ + "unsupported operand type", + "cannot (add|subtract)", + "ufunc '?(add|subtract)'? cannot use operands with types", + ] + ) with pytest.raises(TypeError, match=msg): dtarr + other with pytest.raises(TypeError, match=msg): @@ -2570,24 +2532,3 @@ def test_shift_months(years, months): raw = [x + pd.offsets.DateOffset(years=years, months=months) for x in dti] expected = DatetimeIndex(raw) tm.assert_index_equal(actual, expected) - - -# FIXME: this belongs in scalar tests -class SubDatetime(datetime): - pass - - -@pytest.mark.parametrize( - "lh,rh", - [ - (SubDatetime(2000, 1, 1), Timedelta(hours=1)), - (Timedelta(hours=1), SubDatetime(2000, 1, 1)), - ], -) -def test_dt_subclass_add_timedelta(lh, rh): - # GH 25851 - # ensure that subclassed datetime works for - # Timedelta operations - result = lh + rh - expected = SubDatetime(2000, 1, 1, 1) - assert result == expected diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index 2b23790e4ccd3..d686d9f90a5a4 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -1227,3 +1227,36 @@ def test_addsub_arithmetic(self, dtype, delta): tm.assert_index_equal(index + index, 2 * index) tm.assert_index_equal(index - index, 0 * index) assert not (index - index).empty + + +def test_fill_value_inf_masking(): + # GH #27464 make sure we mask 0/1 with Inf and not NaN + df = pd.DataFrame({"A": [0, 1, 2], "B": [1.1, None, 1.1]}) + + other = pd.DataFrame({"A": [1.1, 1.2, 1.3]}, index=[0, 2, 3]) + + result = df.rfloordiv(other, fill_value=1) + + expected = pd.DataFrame( + {"A": [np.inf, 1.0, 0.0, 1.0], "B": [0.0, np.nan, 0.0, np.nan]} + ) + tm.assert_frame_equal(result, expected) + + +def test_dataframe_div_silenced(): + # GH#26793 + pdf1 = pd.DataFrame( + { + "A": np.arange(10), + "B": [np.nan, 1, 2, 3, 4] * 2, + "C": [np.nan] * 10, + "D": np.arange(10), + }, + index=list("abcdefghij"), + columns=list("ABCD"), + ) + pdf2 = pd.DataFrame( + np.random.randn(10, 4), index=list("abcdefghjk"), columns=list("ABCX") + ) + with tm.assert_produces_warning(None): + pdf1.div(pdf2, fill_value=0) diff --git a/pandas/tests/arithmetic/test_period.py b/pandas/tests/arithmetic/test_period.py index 4b58c290c3cea..ed693d873efb8 100644 --- a/pandas/tests/arithmetic/test_period.py +++ b/pandas/tests/arithmetic/test_period.py @@ -573,12 +573,19 @@ def test_parr_add_sub_float_raises(self, op, other, box_with_array): @pytest.mark.parametrize( "other", [ + # datetime scalars pd.Timestamp.now(), pd.Timestamp.now().to_pydatetime(), pd.Timestamp.now().to_datetime64(), + # datetime-like arrays + pd.date_range("2016-01-01", periods=3, freq="H"), + pd.date_range("2016-01-01", periods=3, tz="Europe/Brussels"), + pd.date_range("2016-01-01", periods=3, 
freq="S")._data, + pd.date_range("2016-01-01", periods=3, tz="Asia/Tokyo")._data, + # Miscellaneous invalid types ], ) - def test_parr_add_sub_datetime_scalar(self, other, box_with_array): + def test_parr_add_sub_invalid(self, other, box_with_array): # GH#23215 rng = pd.period_range("1/1/2000", freq="D", periods=3) rng = tm.box_expected(rng, box_with_array) @@ -595,23 +602,6 @@ def test_parr_add_sub_datetime_scalar(self, other, box_with_array): # ----------------------------------------------------------------- # __add__/__sub__ with ndarray[datetime64] and ndarray[timedelta64] - def test_parr_add_sub_dt64_array_raises(self, box_with_array): - rng = pd.period_range("1/1/2000", freq="D", periods=3) - dti = pd.date_range("2016-01-01", periods=3) - dtarr = dti.values - - rng = tm.box_expected(rng, box_with_array) - - with pytest.raises(TypeError): - rng + dtarr - with pytest.raises(TypeError): - dtarr + rng - - with pytest.raises(TypeError): - rng - dtarr - with pytest.raises(TypeError): - dtarr - rng - def test_pi_add_sub_td64_array_non_tick_raises(self): rng = pd.period_range("1/1/2000", freq="Q", periods=3) tdi = pd.TimedeltaIndex(["-1 Day", "-1 Day", "-1 Day"]) diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index 4f5e00bc5a37d..6d6b85a1e81e1 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -18,6 +18,7 @@ Timestamp, timedelta_range, ) +from pandas.tests.arithmetic.test_datetime64 import assert_invalid_comparison import pandas.util.testing as tm @@ -61,42 +62,33 @@ def test_compare_timedelta64_zerodim(self, box_with_array): # zero-dim of wrong dtype should still raise tdi >= np.array(4) - -class TestTimedelta64ArrayComparisons: - # TODO: All of these need to be parametrized over box - - def test_compare_timedelta_series(self): + @pytest.mark.parametrize( + "td_scalar", + [timedelta(days=1), Timedelta(days=1), Timedelta(days=1).to_timedelta64()], + ) + def test_compare_timedeltalike_scalar(self, box_with_array, td_scalar): # regression test for GH#5963 - s = pd.Series([timedelta(days=1), timedelta(days=2)]) - actual = s > timedelta(days=1) + box = box_with_array + xbox = box if box is not pd.Index else np.ndarray + ser = pd.Series([timedelta(days=1), timedelta(days=2)]) + ser = tm.box_expected(ser, box) + actual = ser > td_scalar expected = pd.Series([False, True]) - tm.assert_series_equal(actual, expected) + expected = tm.box_expected(expected, xbox) + tm.assert_equal(actual, expected) - def test_tdi_cmp_str_invalid(self, box_with_array): - # GH#13624 - xbox = box_with_array if box_with_array is not pd.Index else np.ndarray - tdi = TimedeltaIndex(["1 day", "2 days"]) - tdarr = tm.box_expected(tdi, box_with_array) + @pytest.mark.parametrize("invalid", [345600000000000, "a"]) + def test_td64_comparisons_invalid(self, box_with_array, invalid): + # GH#13624 for str + box = box_with_array + rng = timedelta_range("1 days", periods=10) + obj = tm.box_expected(rng, box) - for left, right in [(tdarr, "a"), ("a", tdarr)]: - with pytest.raises(TypeError): - left > right - with pytest.raises(TypeError): - left >= right - with pytest.raises(TypeError): - left < right - with pytest.raises(TypeError): - left <= right - - result = left == right - expected = np.array([False, False], dtype=bool) - expected = tm.box_expected(expected, xbox) - tm.assert_equal(result, expected) + assert_invalid_comparison(obj, invalid, box) - result = left != right - expected = np.array([True, True], dtype=bool) 
- expected = tm.box_expected(expected, xbox) - tm.assert_equal(result, expected) + +class TestTimedelta64ArrayComparisons: + # TODO: All of these need to be parametrized over box @pytest.mark.parametrize("dtype", [None, object]) def test_comp_nat(self, dtype): @@ -191,10 +183,6 @@ def test_comparisons_coverage(self): expected = np.array([True, True, True] + [False] * 7) tm.assert_numpy_array_equal(result, expected) - # raise TypeError for now - with pytest.raises(TypeError): - rng < rng[3].value - result = rng == list(rng) exp = rng == rng tm.assert_numpy_array_equal(result, exp) @@ -835,19 +823,10 @@ def test_timedelta64_ops_nat(self): # ------------------------------------------------------------- # Invalid Operations - def test_td64arr_add_str_invalid(self, box_with_array): - # GH#13624 + @pytest.mark.parametrize("other", ["a", 3.14, np.array([2.0, 3.0])]) + def test_td64arr_add_sub_invalid(self, box_with_array, other): + # GH#13624 for str tdi = TimedeltaIndex(["1 day", "2 days"]) - tdi = tm.box_expected(tdi, box_with_array) - - with pytest.raises(TypeError): - tdi + "a" - with pytest.raises(TypeError): - "a" + tdi - - @pytest.mark.parametrize("other", [3.14, np.array([2.0, 3.0])]) - def test_td64arr_add_sub_float(self, box_with_array, other): - tdi = TimedeltaIndex(["-1 days", "-1 days"]) tdarr = tm.box_expected(tdi, box_with_array) with pytest.raises(TypeError): @@ -1399,8 +1378,12 @@ def test_td64arr_add_offset_array(self, box): @pytest.mark.parametrize( "names", [(None, None, None), ("foo", "bar", None), ("foo", "foo", "foo")] ) - def test_td64arr_sub_offset_index(self, names, box): + def test_td64arr_sub_offset_index(self, names, box_with_array): # GH#18824, GH#19744 + box = box_with_array + xbox = box if box is not tm.to_array else pd.Index + exname = names[2] if box is not tm.to_array else names[1] + if box is pd.DataFrame and names[1] == "bar": pytest.skip( "Name propagation for DataFrame does not behave like " @@ -1411,11 +1394,11 @@ def test_td64arr_sub_offset_index(self, names, box): other = pd.Index([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)], name=names[1]) expected = TimedeltaIndex( - [tdi[n] - other[n] for n in range(len(tdi))], freq="infer", name=names[2] + [tdi[n] - other[n] for n in range(len(tdi))], freq="infer", name=exname ) tdi = tm.box_expected(tdi, box) - expected = tm.box_expected(expected, box) + expected = tm.box_expected(expected, xbox) # The DataFrame operation is transposed and so operates as separate # scalar operations, which do not issue a PerformanceWarning @@ -1631,7 +1614,7 @@ def test_td64arr_div_nat_invalid(self, box_with_array): rng = timedelta_range("1 days", "10 days", name="foo") rng = tm.box_expected(rng, box_with_array) - with pytest.raises(TypeError, match="'?true_divide'? 
cannot use operands"): + with pytest.raises(TypeError, match="unsupported operand type"): rng / pd.NaT with pytest.raises(TypeError, match="Cannot divide NaTType by"): pd.NaT / rng diff --git a/pandas/tests/arrays/categorical/test_missing.py b/pandas/tests/arrays/categorical/test_missing.py index 1b62479530d24..3037ac79cd592 100644 --- a/pandas/tests/arrays/categorical/test_missing.py +++ b/pandas/tests/arrays/categorical/test_missing.py @@ -5,7 +5,7 @@ from pandas.core.dtypes.dtypes import CategoricalDtype -from pandas import Categorical, Index, isna +from pandas import Categorical, Index, Series, isna import pandas.util.testing as tm @@ -59,11 +59,13 @@ def test_set_item_nan(self): ), (dict(), "Must specify a fill 'value' or 'method'."), (dict(method="bad"), "Invalid fill method. Expecting .* bad"), + (dict(value=Series([1, 2, 3, 4, "a"])), "fill value must be in categories"), ], ) def test_fillna_raises(self, fillna_kwargs, msg): # https://github.com/pandas-dev/pandas/issues/19682 - cat = Categorical([1, 2, 3]) + # https://github.com/pandas-dev/pandas/issues/13628 + cat = Categorical([1, 2, 3, None, None]) with pytest.raises(ValueError, match=msg): cat.fillna(**fillna_kwargs) diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index 9a09ea8422b1f..22c1d5373372a 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -349,7 +349,9 @@ def test_numeric_like_ops(self): ("__mul__", r"\*"), ("__truediv__", "/"), ]: - msg = r"Series cannot perform the operation {}".format(str_rep) + msg = r"Series cannot perform the operation {}|unsupported operand".format( + str_rep + ) with pytest.raises(TypeError, match=msg): getattr(df, op)(df) @@ -375,7 +377,9 @@ def test_numeric_like_ops(self): ("__mul__", r"\*"), ("__truediv__", "/"), ]: - msg = r"Series cannot perform the operation {}".format(str_rep) + msg = r"Series cannot perform the operation {}|unsupported operand".format( + str_rep + ) with pytest.raises(TypeError, match=msg): getattr(s, op)(2) diff --git a/pandas/tests/arrays/sparse/test_arithmetics.py b/pandas/tests/arrays/sparse/test_arithmetics.py index 57e5a35d99e48..cb5b437c962f9 100644 --- a/pandas/tests/arrays/sparse/test_arithmetics.py +++ b/pandas/tests/arrays/sparse/test_arithmetics.py @@ -441,6 +441,23 @@ def test_with_list(op): tm.assert_sp_array_equal(result, expected) +def test_with_dataframe(): + # GH#27910 + arr = pd.SparseArray([0, 1], fill_value=0) + df = pd.DataFrame([[1, 2], [3, 4]]) + result = arr.__add__(df) + assert result is NotImplemented + + +def test_with_zerodim_ndarray(): + # GH#27910 + arr = pd.SparseArray([0, 1], fill_value=0) + + result = arr * np.array(2) + expected = arr * 2 + tm.assert_sp_array_equal(result, expected) + + @pytest.mark.parametrize("ufunc", [np.abs, np.exp]) @pytest.mark.parametrize( "arr", [pd.SparseArray([0, 0, -1, 1]), pd.SparseArray([None, None, -1, 1])] diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index ffda2f4de2700..7c482664bca48 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._libs import OutOfBoundsDatetime + import pandas as pd from pandas.core.arrays import DatetimeArray, PeriodArray, TimedeltaArray import pandas.util.testing as tm @@ -462,6 +464,13 @@ def test_concat_same_type_different_freq(self): 
tm.assert_datetime_array_equal(result, expected) + def test_strftime(self, datetime_index): + arr = DatetimeArray(datetime_index) + + result = arr.strftime("%Y %b") + expected = np.array(datetime_index.strftime("%Y %b")) + tm.assert_numpy_array_equal(result, expected) + class TestTimedeltaArray(SharedTests): index_cls = pd.TimedeltaIndex @@ -608,6 +617,15 @@ def test_to_timestamp(self, how, period_index): # an EA-specific tm.assert_ function tm.assert_index_equal(pd.Index(result), pd.Index(expected)) + def test_to_timestamp_out_of_bounds(self): + # GH#19643 previously overflowed silently + pi = pd.period_range("1500", freq="Y", periods=3) + with pytest.raises(OutOfBoundsDatetime): + pi.to_timestamp() + + with pytest.raises(OutOfBoundsDatetime): + pi._data.to_timestamp() + @pytest.mark.parametrize("propname", PeriodArray._bool_ops) def test_bool_properties(self, period_index, propname): # in this case _bool_ops is just `is_leap_year` @@ -652,6 +670,13 @@ def test_array_interface(self, period_index): expected = np.asarray(arr).astype("S20") tm.assert_numpy_array_equal(result, expected) + def test_strftime(self, period_index): + arr = PeriodArray(period_index) + + result = arr.strftime("%Y") + expected = np.array(period_index.strftime("%Y")) + tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize( "array,casting_nats", diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index 8fbfb4c12f4b2..31a9a0483081e 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -280,7 +280,7 @@ def test_arith_coerce_scalar(self, data, all_arithmetic_operators): other = 0.01 self._check_op(s, op, other) - @pytest.mark.parametrize("other", [1.0, 1.0, np.array(1.0), np.array([1.0])]) + @pytest.mark.parametrize("other", [1.0, np.array(1.0)]) def test_arithmetic_conversion(self, all_arithmetic_operators, other): # if we have a float operand we should have a float result # if that is equal to an integer @@ -290,6 +290,15 @@ def test_arithmetic_conversion(self, all_arithmetic_operators, other): result = op(s, other) assert result.dtype is np.dtype("float") + def test_arith_len_mismatch(self, all_arithmetic_operators): + # operating with a list-like with non-matching length raises + op = self.get_op_from_name(all_arithmetic_operators) + other = np.array([1.0]) + + s = pd.Series([1, 2, 3], dtype="Int64") + with pytest.raises(ValueError, match="Lengths must match"): + op(s, other) + @pytest.mark.parametrize("other", [0, 0.5]) def test_arith_zero_dim_ndarray(self, other): arr = integer_array([1, None, 2]) @@ -322,8 +331,9 @@ def test_error(self, data, all_arithmetic_operators): ops(pd.Series(pd.date_range("20180101", periods=len(s)))) # 2d - with pytest.raises(NotImplementedError): - opa(pd.DataFrame({"A": s})) + result = opa(pd.DataFrame({"A": s})) + assert result is NotImplemented + with pytest.raises(NotImplementedError): opa(np.arange(len(s)).reshape(-1, len(s))) @@ -379,8 +389,6 @@ def test_compare_array(self, data, all_compare_operators): class TestCasting: - pass - @pytest.mark.parametrize("dropna", [True, False]) def test_construct_index(self, all_data, dropna): # ensure that we do not coerce to Float64Index, rather diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index c500760fa1390..b6ffd8a83e409 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -14,7 +14,7 @@ from pandas.core.dtypes.common import is_bool, is_list_like, is_scalar 
import pandas as pd -from pandas import DataFrame, Series, date_range +from pandas import DataFrame, Series, compat, date_range from pandas.core.computation import pytables from pandas.core.computation.check import _NUMEXPR_VERSION from pandas.core.computation.engines import NumExprClobberingError, _engines @@ -1267,7 +1267,10 @@ def test_assignment_column(self): msg = "left hand side of an assignment must be a single name" with pytest.raises(SyntaxError, match=msg): df.eval("d,c = a + b") - msg = "can't assign to function call" + if compat.PY38: + msg = "cannot assign to function call" + else: + msg = "can't assign to function call" with pytest.raises(SyntaxError, match=msg): df.eval('Timestamp("20131001") = a + b') @@ -1967,6 +1970,26 @@ def test_bool_ops_fails_on_scalars(lhs, cmp, rhs, engine, parser): pd.eval(ex, engine=engine, parser=parser) +@pytest.mark.parametrize( + "other", + [ + "'x'", + pytest.param( + "...", marks=pytest.mark.xfail(not compat.PY38, reason="GH-28116") + ), + ], +) +def test_equals_various(other): + df = DataFrame({"A": ["a", "b", "c"]}) + result = df.eval("A == {}".format(other)) + expected = Series([False, False, False], name="A") + if _USE_NUMEXPR: + # https://github.com/pandas-dev/pandas/issues/10239 + # lose name with numexpr engine. Remove when that's fixed. + expected.name = None + tm.assert_series_equal(result, expected) + + def test_inf(engine, parser): s = "inf + 1" expected = np.inf diff --git a/pandas/tests/config/test_config.py b/pandas/tests/config/test_config.py index 3f12d1d7a292d..efaeb7b1471ec 100644 --- a/pandas/tests/config/test_config.py +++ b/pandas/tests/config/test_config.py @@ -208,13 +208,16 @@ def test_set_option_multiple(self): def test_validation(self): self.cf.register_option("a", 1, "doc", validator=self.cf.is_int) + self.cf.register_option("d", 1, "doc", validator=self.cf.is_nonnegative_int) self.cf.register_option("b.c", "hullo", "doc2", validator=self.cf.is_text) + msg = "Value must have type ''" with pytest.raises(ValueError, match=msg): self.cf.register_option("a.b.c.d2", "NO", "doc", validator=self.cf.is_int) self.cf.set_option("a", 2) # int is_int self.cf.set_option("b.c", "wurld") # str is_str + self.cf.set_option("d", 2) # None not is_int with pytest.raises(ValueError, match=msg): @@ -222,6 +225,16 @@ def test_validation(self): with pytest.raises(ValueError, match=msg): self.cf.set_option("a", "ab") + msg = "Value must be a nonnegative integer or None" + with pytest.raises(ValueError, match=msg): + self.cf.register_option( + "a.b.c.d3", "NO", "doc", validator=self.cf.is_nonnegative_int + ) + with pytest.raises(ValueError, match=msg): + self.cf.register_option( + "a.b.c.d3", -2, "doc", validator=self.cf.is_nonnegative_int + ) + msg = r"Value must be an instance of \|" with pytest.raises(ValueError, match=msg): self.cf.set_option("b.c", 1) diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py index a688dec50bc95..bbc485ecf94f2 100644 --- a/pandas/tests/dtypes/test_missing.py +++ b/pandas/tests/dtypes/test_missing.py @@ -86,6 +86,10 @@ def test_isna_isnull(self, isna_f): assert not isna_f(np.inf) assert not isna_f(-np.inf) + # type + assert not isna_f(type(pd.Series())) + assert not isna_f(type(pd.DataFrame())) + # series for s in [ tm.makeFloatSeries(), diff --git a/pandas/tests/extension/arrow/test_bool.py b/pandas/tests/extension/arrow/test_bool.py index cc0deca765b41..9c53210b75d6b 100644 --- a/pandas/tests/extension/arrow/test_bool.py +++ 
b/pandas/tests/extension/arrow/test_bool.py @@ -41,6 +41,10 @@ def test_copy(self, data): # __setitem__ does not work, so we only have a smoke-test data.copy() + def test_view(self, data): + # __setitem__ does not work, so we only have a smoke-test + data.view() + class TestConstructors(BaseArrowTests, base.BaseConstructorsTests): def test_from_dtype(self, data): diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index e02586eacfea7..d56cc50f4739c 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -260,3 +260,9 @@ def test_reindex_non_na_fill_value(self, data_missing): expected = pd.Series(data_missing._from_sequence([na, valid, valid])) self.assert_series_equal(result, expected) + + def test_loc_len1(self, data): + # see GH-27785 take_nd with indexer of len 1 resulting in wrong ndim + df = pd.DataFrame({"A": data}) + res = df.loc[[0], "A"] + assert res._data._block.ndim == 1 diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py index dee8021f5375f..a29f6deeffae6 100644 --- a/pandas/tests/extension/base/interface.py +++ b/pandas/tests/extension/base/interface.py @@ -75,3 +75,18 @@ def test_copy(self, data): data[1] = data[0] assert result[1] != result[0] + + def test_view(self, data): + # view with no dtype should return a shallow copy, *not* the same + # object + assert data[1] != data[0] + + result = data.view() + assert result is not data + assert type(result) == type(data) + + result[1] = result[0] + assert data[1] == data[0] + + # check specifically that the `dtype` kwarg is accepted + data.view(dtype=None) diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index c28ff956a33a4..a1988744d76a1 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -137,11 +137,11 @@ def __setitem__(self, key, value): value = decimal.Decimal(value) self._data[key] = value - def __len__(self): + def __len__(self) -> int: return len(self._data) @property - def nbytes(self): + def nbytes(self) -> int: n = len(self) if n: return n * sys.getsizeof(self[0]) diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 9dec023f4073a..3ac9d37ccf4f3 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -392,17 +392,6 @@ def test_ufunc_fallback(data): tm.assert_series_equal(result, expected) -def test_formatting_values_deprecated(): - class DecimalArray2(DecimalArray): - def _formatting_values(self): - return np.array(self) - - ser = pd.Series(DecimalArray2([decimal.Decimal("1.0")])) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - repr(ser) - - def test_array_ufunc(): a = to_decimal([1, 2, 3]) result = np.exp(a) diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 21c4ac8f055a2..b64ddbd6ac84d 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -80,6 +80,9 @@ def __getitem__(self, item): elif isinstance(item, abc.Iterable): # fancy indexing return type(self)([self.data[i] for i in item]) + elif isinstance(item, slice) and item == slice(None): + # Make sure we get a view + return type(self)(self.data) else: # slice return type(self)(self.data[item]) @@ -103,11 +106,11 @@ def __setitem__(self, key, value): assert isinstance(v, self.dtype.type) self.data[k] = v - def 
__len__(self): + def __len__(self) -> int: return len(self.data) @property - def nbytes(self): + def nbytes(self) -> int: return sys.getsizeof(self.data) def isna(self): diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index f7456d24ad6d3..0c0e8b0123c03 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -211,7 +211,7 @@ def test_arith_series_with_scalar(self, data, all_arithmetic_operators): def test_add_series_with_extension_array(self, data): ser = pd.Series(data) - with pytest.raises(TypeError, match="cannot perform"): + with pytest.raises(TypeError, match="cannot perform|unsupported operand"): ser + data def test_divmod_series_array(self): diff --git a/pandas/tests/extension/test_external_block.py b/pandas/tests/extension/test_external_block.py index 1a4f84e2c0fd2..6311070cfe2bb 100644 --- a/pandas/tests/extension/test_external_block.py +++ b/pandas/tests/extension/test_external_block.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -from pandas.core.internals import BlockManager, SingleBlockManager +from pandas.core.internals import BlockManager from pandas.core.internals.blocks import Block, NonConsolidatableMixIn @@ -10,9 +10,6 @@ class CustomBlock(NonConsolidatableMixIn, Block): _holder = np.ndarray - def formatting_values(self): - return np.array(["Val: {}".format(i) for i in self.values]) - def concat_same_type(self, to_concat, placement=None): """ Always concatenate disregarding self.ndim as the values are @@ -35,22 +32,6 @@ def df(): return pd.DataFrame(block_manager) -def test_custom_repr(): - values = np.arange(3, dtype="int64") - - # series - block = CustomBlock(values, placement=slice(0, 3)) - - s = pd.Series(SingleBlockManager(block, pd.RangeIndex(3))) - assert repr(s) == "0 Val: 0\n1 Val: 1\n2 Val: 2\ndtype: int64" - - # dataframe - block = CustomBlock(values, placement=slice(0, 1)) - blk_mgr = BlockManager([block], [["col"], range(3)]) - df = pd.DataFrame(blk_mgr) - assert repr(df) == " col\n0 Val: 0\n1 Val: 1\n2 Val: 2" - - def test_concat_series(): # GH17728 values = np.arange(3, dtype="int64") diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index 1aab71286b4a6..4fdcf930d224f 100644 --- a/pandas/tests/extension/test_interval.py +++ b/pandas/tests/extension/test_interval.py @@ -95,7 +95,10 @@ class TestGrouping(BaseInterval, base.BaseGroupbyTests): class TestInterface(BaseInterval, base.BaseInterfaceTests): - pass + def test_view(self, data): + # __setitem__ incorrectly makes a copy (GH#27147), so we only + # have a smoke-test + data.view() class TestReduce(base.BaseNoReduceTests): diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 84d59902d2aa7..6ebe71e173ec2 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -103,6 +103,10 @@ def test_copy(self, data): # __setitem__ does not work, so we only have a smoke-test data.copy() + def test_view(self, data): + # __setitem__ does not work, so we only have a smoke-test + data.view() + class TestConstructors(BaseSparseTests, base.BaseConstructorsTests): pass diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py index ae14563e5952a..a78b2ab7d1c4c 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/test_indexing.py @@ -821,6 +821,14 @@ def test_setitem_empty_frame_with_boolean(self, dtype, kwargs): df[df > df2] = 47 
assert_frame_equal(df, df2) + def test_setitem_with_empty_listlike(self): + # GH #17101 + index = pd.Index([], name="idx") + result = pd.DataFrame(columns=["A"], index=index) + result["A"] = [] + expected = pd.DataFrame(columns=["A"], index=index) + tm.assert_index_equal(result.index, expected.index) + def test_setitem_scalars_no_index(self): # GH16823 / 17894 df = DataFrame() diff --git a/pandas/tests/frame/test_replace.py b/pandas/tests/frame/test_replace.py index 2862615ef8585..b341ed6a52ca5 100644 --- a/pandas/tests/frame/test_replace.py +++ b/pandas/tests/frame/test_replace.py @@ -1069,18 +1069,24 @@ def test_replace_truthy(self): e = df assert_frame_equal(r, e) - def test_replace_int_to_int_chain(self): + def test_nested_dict_overlapping_keys_replace_int(self): + # GH 27660 keep behaviour consistent for simple dictionary and + # nested dictionary replacement df = DataFrame({"a": list(range(1, 5))}) - with pytest.raises(ValueError, match="Replacement not allowed .+"): - df.replace({"a": dict(zip(range(1, 5), range(2, 6)))}) - def test_replace_str_to_str_chain(self): + result = df.replace({"a": dict(zip(range(1, 5), range(2, 6)))}) + expected = df.replace(dict(zip(range(1, 5), range(2, 6)))) + assert_frame_equal(result, expected) + + def test_nested_dict_overlapping_keys_replace_str(self): + # GH 27660 a = np.arange(1, 5) astr = a.astype(str) bstr = np.arange(2, 6).astype(str) df = DataFrame({"a": astr}) - with pytest.raises(ValueError, match="Replacement not allowed .+"): - df.replace({"a": dict(zip(astr, bstr))}) + result = df.replace(dict(zip(astr, bstr))) + expected = df.replace({"a": dict(zip(astr, bstr))}) + assert_frame_equal(result, expected) def test_replace_swapping_bug(self): df = pd.DataFrame({"a": [True, False, True]}) diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index f3452e9a85fb3..84e343f07f990 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -984,7 +984,7 @@ def test_stack_preserve_categorical_dtype(self, ordered, labels): df = DataFrame([[10, 11, 12]], columns=cidx) result = df.stack() - # `MutliIndex.from_product` preserves categorical dtype - + # `MultiIndex.from_product` preserves categorical dtype - # it's tested elsewhere. 
midx = pd.MultiIndex.from_product([df.index, cidx]) expected = Series([10, 11, 12], index=midx) diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index e2e4a82ff581c..8fb028a0f0326 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -695,6 +695,20 @@ def _make_frame(names=None): tm.assert_index_equal(recons.columns, exp.columns) assert len(recons) == 0 + def test_to_csv_interval_index(self): + # GH 28210 + df = DataFrame({"A": list("abc"), "B": range(3)}, index=pd.interval_range(0, 3)) + + with ensure_clean("__tmp_to_csv_interval_index__.csv") as path: + df.to_csv(path) + result = self.read_csv(path, index_col=0) + + # can't roundtrip intervalindex via read_csv so check string repr (GH 23595) + expected = df.copy() + expected.index = expected.index.astype(str) + + assert_frame_equal(result, expected) + def test_to_csv_float32_nanrep(self): df = DataFrame(np.random.randn(1, 4).astype(np.float32)) df[1] = np.nan diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 52d4fa76bf879..aa80c461a00e7 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -10,7 +10,7 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, compat, concat from pandas.core.base import SpecificationError -from pandas.core.groupby.generic import _maybe_mangle_lambdas +from pandas.core.groupby.generic import _make_unique, _maybe_mangle_lambdas from pandas.core.groupby.grouper import Grouping import pandas.util.testing as tm @@ -560,3 +560,150 @@ def test_with_kwargs(self): result = pd.Series([1, 2]).groupby([0, 0]).agg([f1, f2], 0, b=10) expected = pd.DataFrame({"": [13], "": [30]}) tm.assert_frame_equal(result, expected) + + def test_agg_with_one_lambda(self): + # GH 25719, write tests for DataFrameGroupby.agg with only one lambda + df = pd.DataFrame( + { + "kind": ["cat", "dog", "cat", "dog"], + "height": [9.1, 6.0, 9.5, 34.0], + "weight": [7.9, 7.5, 9.9, 198.0], + } + ) + + # sort for 35 and earlier + columns = ["height_sqr_min", "height_max", "weight_max"] + if compat.PY35: + columns = ["height_max", "height_sqr_min", "weight_max"] + expected = pd.DataFrame( + { + "height_sqr_min": [82.81, 36.00], + "height_max": [9.5, 34.0], + "weight_max": [9.9, 198.0], + }, + index=pd.Index(["cat", "dog"], name="kind"), + columns=columns, + ) + + # check pd.NameAgg case + result1 = df.groupby(by="kind").agg( + height_sqr_min=pd.NamedAgg( + column="height", aggfunc=lambda x: np.min(x ** 2) + ), + height_max=pd.NamedAgg(column="height", aggfunc="max"), + weight_max=pd.NamedAgg(column="weight", aggfunc="max"), + ) + tm.assert_frame_equal(result1, expected) + + # check agg(key=(col, aggfunc)) case + result2 = df.groupby(by="kind").agg( + height_sqr_min=("height", lambda x: np.min(x ** 2)), + height_max=("height", "max"), + weight_max=("weight", "max"), + ) + tm.assert_frame_equal(result2, expected) + + def test_agg_multiple_lambda(self): + # GH25719, test for DataFrameGroupby.agg with multiple lambdas + # with mixed aggfunc + df = pd.DataFrame( + { + "kind": ["cat", "dog", "cat", "dog"], + "height": [9.1, 6.0, 9.5, 34.0], + "weight": [7.9, 7.5, 9.9, 198.0], + } + ) + # sort for 35 and earlier + columns = [ + "height_sqr_min", + "height_max", + "weight_max", + "height_max_2", + "weight_min", + ] + if compat.PY35: + columns = [ + "height_max", + "height_max_2", + "height_sqr_min", + "weight_max", + "weight_min", + ] 
+ expected = pd.DataFrame( + { + "height_sqr_min": [82.81, 36.00], + "height_max": [9.5, 34.0], + "weight_max": [9.9, 198.0], + "height_max_2": [9.5, 34.0], + "weight_min": [7.9, 7.5], + }, + index=pd.Index(["cat", "dog"], name="kind"), + columns=columns, + ) + + # check agg(key=(col, aggfunc)) case + result1 = df.groupby(by="kind").agg( + height_sqr_min=("height", lambda x: np.min(x ** 2)), + height_max=("height", "max"), + weight_max=("weight", "max"), + height_max_2=("height", lambda x: np.max(x)), + weight_min=("weight", lambda x: np.min(x)), + ) + tm.assert_frame_equal(result1, expected) + + # check pd.NamedAgg case + result2 = df.groupby(by="kind").agg( + height_sqr_min=pd.NamedAgg( + column="height", aggfunc=lambda x: np.min(x ** 2) + ), + height_max=pd.NamedAgg(column="height", aggfunc="max"), + weight_max=pd.NamedAgg(column="weight", aggfunc="max"), + height_max_2=pd.NamedAgg(column="height", aggfunc=lambda x: np.max(x)), + weight_min=pd.NamedAgg(column="weight", aggfunc=lambda x: np.min(x)), + ) + tm.assert_frame_equal(result2, expected) + + @pytest.mark.parametrize( + "order, expected_reorder", + [ + ( + [ + ("height", ""), + ("height", "max"), + ("weight", "max"), + ("height", ""), + ("weight", ""), + ], + [ + ("height", "_0"), + ("height", "max"), + ("weight", "max"), + ("height", "_1"), + ("weight", ""), + ], + ), + ( + [ + ("col2", "min"), + ("col1", ""), + ("col1", ""), + ("col1", ""), + ], + [ + ("col2", "min"), + ("col1", "_0"), + ("col1", "_1"), + ("col1", "_2"), + ], + ), + ( + [("col", ""), ("col", ""), ("col", "")], + [("col", "_0"), ("col", "_1"), ("col", "_2")], + ), + ], + ) + def test_make_unique(self, order, expected_reorder): + # GH 27519, test if make_unique function reorders correctly + result = _make_unique(order) + + assert result == expected_reorder diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py index 2195686ee9c7f..b8f9ecd42bae3 100644 --- a/pandas/tests/groupby/test_bin_groupby.py +++ b/pandas/tests/groupby/test_bin_groupby.py @@ -2,7 +2,7 @@ from numpy import nan import pytest -from pandas._libs import groupby, lib, reduction +from pandas._libs import groupby, lib, reduction as libreduction from pandas.core.dtypes.common import ensure_int64 @@ -18,7 +18,7 @@ def test_series_grouper(): labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.int64) - grouper = reduction.SeriesGrouper(obj, np.mean, labels, 2, dummy) + grouper = libreduction.SeriesGrouper(obj, np.mean, labels, 2, dummy) result, counts = grouper.get_result() expected = np.array([obj[3:6].mean(), obj[6:].mean()]) @@ -34,7 +34,7 @@ def test_series_bin_grouper(): bins = np.array([3, 6]) - grouper = reduction.SeriesBinGrouper(obj, np.mean, bins, dummy) + grouper = libreduction.SeriesBinGrouper(obj, np.mean, bins, dummy) result, counts = grouper.get_result() expected = np.array([obj[:3].mean(), obj[3:6].mean(), obj[6:].mean()]) @@ -120,31 +120,31 @@ class TestMoments: class TestReducer: def test_int_index(self): arr = np.random.randn(100, 4) - result = reduction.compute_reduction(arr, np.sum, labels=Index(np.arange(4))) + result = libreduction.compute_reduction(arr, np.sum, labels=Index(np.arange(4))) expected = arr.sum(0) assert_almost_equal(result, expected) - result = reduction.compute_reduction( + result = libreduction.compute_reduction( arr, np.sum, axis=1, labels=Index(np.arange(100)) ) expected = arr.sum(1) assert_almost_equal(result, expected) dummy = Series(0.0, index=np.arange(100)) - result = reduction.compute_reduction( + result = 
libreduction.compute_reduction( arr, np.sum, dummy=dummy, labels=Index(np.arange(4)) ) expected = arr.sum(0) assert_almost_equal(result, expected) dummy = Series(0.0, index=np.arange(4)) - result = reduction.compute_reduction( + result = libreduction.compute_reduction( arr, np.sum, axis=1, dummy=dummy, labels=Index(np.arange(100)) ) expected = arr.sum(1) assert_almost_equal(result, expected) - result = reduction.compute_reduction( + result = libreduction.compute_reduction( arr, np.sum, axis=1, dummy=dummy, labels=Index(np.arange(100)) ) assert_almost_equal(result, expected) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 9b8c8e6d8a077..e09af3fd48ee6 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas.compat import PY37, is_platform_windows + import pandas as pd from pandas import ( Categorical, @@ -208,6 +210,9 @@ def test_level_get_group(observed): # GH#21636 previously flaky on py37 +@pytest.mark.xfail( + is_platform_windows() and PY37, reason="Flaky, GH-27902", strict=False +) @pytest.mark.parametrize("ordered", [True, False]) def test_apply(ordered): # GH 10138 @@ -429,6 +434,21 @@ def test_observed_groups_with_nan(observed): tm.assert_dict_equal(result, expected) +def test_observed_nth(): + # GH 26385 + cat = pd.Categorical(["a", np.nan, np.nan], categories=["a", "b", "c"]) + ser = pd.Series([1, 2, 3]) + df = pd.DataFrame({"cat": cat, "ser": ser}) + + result = df.groupby("cat", observed=False)["ser"].nth(0) + + index = pd.Categorical(["a", "b", "c"], categories=["a", "b", "c"]) + expected = pd.Series([1, np.nan, np.nan], index=index, name="ser") + expected.index.name = "cat" + + tm.assert_series_equal(result, expected) + + def test_dataframe_categorical_with_nan(observed): # GH 21151 s1 = Categorical([np.nan, "a", np.nan, "a"], categories=["a", "b", "c"]) @@ -506,7 +526,7 @@ def test_datetime(): desc_result = grouped.describe() idx = cats.codes.argsort() - ord_labels = cats.take_nd(idx) + ord_labels = cats.take(idx) ord_data = data.take(idx) expected = ord_data.groupby(ord_labels, observed=False).describe() assert_frame_equal(desc_result, expected) @@ -1163,3 +1183,13 @@ def test_seriesgroupby_observed_apply_dict(df_cat, observed, index, data): lambda x: OrderedDict([("min", x.min()), ("max", x.max())]) ) assert_series_equal(result, expected) + + +@pytest.mark.parametrize("code", [([1, 0, 0]), ([0, 0, 0])]) +def test_groupby_categorical_axis_1(code): + # GH 13420 + df = DataFrame({"a": [1, 2, 3, 4], "b": [-1, -2, -3, -4], "c": [5, 6, 7, 8]}) + cat = pd.Categorical.from_codes(code, categories=list("abc")) + result = df.groupby(cat, axis=1).mean() + expected = df.T.groupby(cat, axis=0).mean().T + assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index efc3142b25b82..d89233f2fd603 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1238,6 +1238,75 @@ def test_quantile(interpolation, a_vals, b_vals, q): tm.assert_frame_equal(result, expected) +def test_quantile_array(): + # https://github.com/pandas-dev/pandas/issues/27526 + df = pd.DataFrame({"A": [0, 1, 2, 3, 4]}) + result = df.groupby([0, 0, 1, 1, 1]).quantile([0.25]) + + index = pd.MultiIndex.from_product([[0, 1], [0.25]]) + expected = pd.DataFrame({"A": [0.25, 2.50]}, index=index) + tm.assert_frame_equal(result, expected) + + df = 
pd.DataFrame({"A": [0, 1, 2, 3], "B": [4, 5, 6, 7]}) + index = pd.MultiIndex.from_product([[0, 1], [0.25, 0.75]]) + + result = df.groupby([0, 0, 1, 1]).quantile([0.25, 0.75]) + expected = pd.DataFrame( + {"A": [0.25, 0.75, 2.25, 2.75], "B": [4.25, 4.75, 6.25, 6.75]}, index=index + ) + tm.assert_frame_equal(result, expected) + + +def test_quantile_array2(): + # https://github.com/pandas-dev/pandas/pull/28085#issuecomment-524066959 + df = pd.DataFrame( + np.random.RandomState(0).randint(0, 5, size=(10, 3)), columns=list("ABC") + ) + result = df.groupby("A").quantile([0.3, 0.7]) + expected = pd.DataFrame( + { + "B": [0.9, 2.1, 2.2, 3.4, 1.6, 2.4, 2.3, 2.7, 0.0, 0.0], + "C": [1.2, 2.8, 1.8, 3.0, 0.0, 0.0, 1.9, 3.1, 3.0, 3.0], + }, + index=pd.MultiIndex.from_product( + [[0, 1, 2, 3, 4], [0.3, 0.7]], names=["A", None] + ), + ) + tm.assert_frame_equal(result, expected) + + +def test_quantile_array_no_sort(): + df = pd.DataFrame({"A": [0, 1, 2], "B": [3, 4, 5]}) + result = df.groupby([1, 0, 1], sort=False).quantile([0.25, 0.5, 0.75]) + expected = pd.DataFrame( + {"A": [0.5, 1.0, 1.5, 1.0, 1.0, 1.0], "B": [3.5, 4.0, 4.5, 4.0, 4.0, 4.0]}, + index=pd.MultiIndex.from_product([[1, 0], [0.25, 0.5, 0.75]]), + ) + tm.assert_frame_equal(result, expected) + + result = df.groupby([1, 0, 1], sort=False).quantile([0.75, 0.25]) + expected = pd.DataFrame( + {"A": [1.5, 0.5, 1.0, 1.0], "B": [4.5, 3.5, 4.0, 4.0]}, + index=pd.MultiIndex.from_product([[1, 0], [0.75, 0.25]]), + ) + tm.assert_frame_equal(result, expected) + + +def test_quantile_array_multiple_levels(): + df = pd.DataFrame( + {"A": [0, 1, 2], "B": [3, 4, 5], "c": ["a", "a", "a"], "d": ["a", "a", "b"]} + ) + result = df.groupby(["c", "d"]).quantile([0.25, 0.75]) + index = pd.MultiIndex.from_tuples( + [("a", "a", 0.25), ("a", "a", 0.75), ("a", "b", 0.25), ("a", "b", 0.75)], + names=["c", "d", None], + ) + expected = pd.DataFrame( + {"A": [0.25, 0.75, 2.0, 2.0], "B": [3.25, 3.75, 5.0, 5.0]}, index=index + ) + tm.assert_frame_equal(result, expected) + + def test_quantile_raises(): df = pd.DataFrame( [["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"] @@ -1247,6 +1316,17 @@ def test_quantile_raises(): df.groupby("key").quantile() +def test_quantile_out_of_bounds_q_raises(): + # https://github.com/pandas-dev/pandas/issues/27470 + df = pd.DataFrame(dict(a=[0, 0, 0, 1, 1, 1], b=range(6))) + g = df.groupby([0, 0, 0, 1, 1, 1]) + with pytest.raises(ValueError, match="Got '50.0' instead"): + g.quantile(50) + + with pytest.raises(ValueError, match="Got '-1.0' instead"): + g.quantile(-1) + + # pipe # -------------------------------- diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 9459069f0ea2d..0e74c87388682 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -1,3 +1,5 @@ +import gc + import numpy as np import pytest @@ -908,3 +910,10 @@ def test_is_unique(self): # multiple NA should not be unique index_na_dup = index_na.insert(0, np.nan) assert index_na_dup.is_unique is False + + def test_engine_reference_cycle(self): + # GH27585 + index = self.create_index() + nrefs_pre = len(gc.get_referrers(index)) + index._engine + assert len(gc.get_referrers(index)) == nrefs_pre diff --git a/pandas/tests/indexes/datetimes/test_misc.py b/pandas/tests/indexes/datetimes/test_misc.py index 4ea32359b8d4a..ab3107a0798e5 100644 --- a/pandas/tests/indexes/datetimes/test_misc.py +++ b/pandas/tests/indexes/datetimes/test_misc.py @@ -377,3 +377,11 @@ def test_nanosecond_field(self): dti 
= DatetimeIndex(np.arange(10)) tm.assert_index_equal(dti.nanosecond, pd.Index(np.arange(10, dtype=np.int64))) + + +def test_iter_readonly(): + # GH#28055 ints_to_pydatetime with readonly array + arr = np.array([np.datetime64("2012-02-15T12:00:00.000000000")]) + arr.setflags(write=False) + dti = pd.to_datetime(arr) + list(dti) diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 8db15709da35d..9af0f47f6dce9 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -1620,6 +1620,18 @@ def test_dayfirst(self, cache): tm.assert_index_equal(expected, idx5) tm.assert_index_equal(expected, idx6) + @pytest.mark.parametrize("klass", [DatetimeIndex, DatetimeArray]) + def test_to_datetime_dta_tz(self, klass): + # GH#27733 + dti = date_range("2015-04-05", periods=3).rename("foo") + expected = dti.tz_localize("UTC") + + obj = klass(dti) + expected = klass(expected) + + result = to_datetime(obj, utc=True) + tm.assert_equal(result, expected) + class TestGuessDatetimeFormat: @td.skip_if_not_us_locale diff --git a/pandas/tests/indexes/interval/test_construction.py b/pandas/tests/indexes/interval/test_construction.py index e2abb4531525a..82a10d24dad30 100644 --- a/pandas/tests/indexes/interval/test_construction.py +++ b/pandas/tests/indexes/interval/test_construction.py @@ -421,32 +421,3 @@ def test_index_mixed_closed(self): result = Index(intervals) expected = Index(intervals, dtype=object) tm.assert_index_equal(result, expected) - - -class TestFromIntervals(TestClassConstructors): - """ - Tests for IntervalIndex.from_intervals, which is deprecated in favor of the - IntervalIndex constructor. Same tests as the IntervalIndex constructor, - plus deprecation test. Should only need to delete this class when removed. 
- """ - - @pytest.fixture - def constructor(self): - def from_intervals_ignore_warnings(*args, **kwargs): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - return IntervalIndex.from_intervals(*args, **kwargs) - - return from_intervals_ignore_warnings - - def test_deprecated(self): - ivs = [Interval(0, 1), Interval(1, 2)] - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - IntervalIndex.from_intervals(ivs) - - @pytest.mark.skip(reason="parent class test that is not applicable") - def test_index_object_dtype(self): - pass - - @pytest.mark.skip(reason="parent class test that is not applicable") - def test_index_mixed_closed(self): - pass diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index c1a21e6a7f152..eeb0f43f4b900 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -417,6 +417,46 @@ def test_repr_missing(self, constructor, expected): result = repr(obj) assert result == expected + @pytest.mark.parametrize( + "tuples, closed, expected_data", + [ + ([(0, 1), (1, 2), (2, 3)], "left", ["[0, 1)", "[1, 2)", "[2, 3)"]), + ( + [(0.5, 1.0), np.nan, (2.0, 3.0)], + "right", + ["(0.5, 1.0]", "NaN", "(2.0, 3.0]"], + ), + ( + [ + (Timestamp("20180101"), Timestamp("20180102")), + np.nan, + ((Timestamp("20180102"), Timestamp("20180103"))), + ], + "both", + ["[2018-01-01, 2018-01-02]", "NaN", "[2018-01-02, 2018-01-03]"], + ), + ( + [ + (Timedelta("0 days"), Timedelta("1 days")), + (Timedelta("1 days"), Timedelta("2 days")), + np.nan, + ], + "neither", + [ + "(0 days 00:00:00, 1 days 00:00:00)", + "(1 days 00:00:00, 2 days 00:00:00)", + "NaN", + ], + ), + ], + ) + def test_to_native_types(self, tuples, closed, expected_data): + # GH 28210 + index = IntervalIndex.from_tuples(tuples, closed=closed) + result = index.to_native_types() + expected = np.array(expected_data) + tm.assert_numpy_array_equal(result, expected) + def test_get_item(self, closed): i = IntervalIndex.from_arrays((0, 1, np.nan), (1, 2, np.nan), closed=closed) assert i[0] == Interval(0.0, 1.0, closed=closed) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index c40a9bce9385b..d1ed79118d2fa 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -2805,3 +2805,17 @@ def test_deprecated_fastpath(): expected = pd.CategoricalIndex(["a", "b", "c"], name="test") tm.assert_index_equal(idx, expected) + + +def test_shape_of_invalid_index(): + # Currently, it is possible to create "invalid" index objects backed by + # a multi-dimensional array (see https://github.com/pandas-dev/pandas/issues/27125 + # about this). 
However, as long as this is not solved in general,this test ensures + # that the returned shape is consistent with this underlying array for + # compat with matplotlib (see https://github.com/pandas-dev/pandas/issues/27775) + a = np.arange(8).reshape(2, 2, 2) + idx = pd.Index(a) + assert idx.shape == a.shape + + idx = pd.Index([0, 1, 2, 3]) + assert idx[:, None].shape == (4, 1) diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 280b0a99c7e68..67bf9bd20e716 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -823,6 +823,11 @@ def test_equals_categorical(self): msg = ( "categorical index comparisons must have the same categories" " and ordered attributes" + "|" + "Categoricals can only be compared if 'categories' are the same. " + "Categories are different lengths" + "|" + "Categoricals can only be compared if 'ordered' is the same" ) with pytest.raises(TypeError, match=msg): ci1 == ci2 diff --git a/pandas/tests/indexes/test_numpy_compat.py b/pandas/tests/indexes/test_numpy_compat.py index f9ca1bca04165..645ad19ea4cc9 100644 --- a/pandas/tests/indexes/test_numpy_compat.py +++ b/pandas/tests/indexes/test_numpy_compat.py @@ -118,4 +118,7 @@ def test_elementwise_comparison_warning(): # this test. idx = Index([1, 2]) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - idx == "a" + result = idx == "a" + + expected = np.array([False, False]) + tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/indexing/multiindex/test_loc.py b/pandas/tests/indexing/multiindex/test_loc.py index a08b2b4c66af2..8b48c2bf7169f 100644 --- a/pandas/tests/indexing/multiindex/test_loc.py +++ b/pandas/tests/indexing/multiindex/test_loc.py @@ -390,3 +390,26 @@ def test_loc_getitem_lowerdim_corner(multiindex_dataframe_random_data): expected = 0 result = df.sort_index().loc[("bar", "three"), "B"] assert result == expected + + +def test_loc_setitem_single_column_slice(): + # case from https://github.com/pandas-dev/pandas/issues/27841 + df = DataFrame( + "string", + index=list("abcd"), + columns=MultiIndex.from_product([["Main"], ("another", "one")]), + ) + df["labels"] = "a" + df.loc[:, "labels"] = df.index + tm.assert_numpy_array_equal(np.asarray(df["labels"]), np.asarray(df.index)) + + # test with non-object block + df = DataFrame( + np.nan, + index=range(4), + columns=MultiIndex.from_tuples([("A", "1"), ("A", "2"), ("B", "1")]), + ) + expected = df.copy() + df.loc[:, "B"] = np.arange(4) + expected.iloc[:, 2] = np.arange(4) + tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index ed80e249220fd..05b58b0eca9b8 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -1038,10 +1038,6 @@ def test_replace_series(self, how, to_key, from_key): "from_key", ["datetime64[ns, UTC]", "datetime64[ns, US/Eastern]"] ) def test_replace_series_datetime_tz(self, how, to_key, from_key): - how = "series" - from_key = "datetime64[ns, US/Eastern]" - to_key = "timedelta64[ns]" - index = pd.Index([3, 4], name="xyz") obj = pd.Series(self.rep[from_key], index=index, name="yyy") assert obj.dtype == from_key diff --git a/pandas/tests/indexing/test_ix.py b/pandas/tests/indexing/test_ix.py index 45ccd8d1b8fb3..6029db8ed66f6 100644 --- a/pandas/tests/indexing/test_ix.py +++ b/pandas/tests/indexing/test_ix.py @@ -343,3 +343,13 @@ def test_ix_duplicate_returns_series(self): r = 
df.ix[0.2, "a"] e = df.loc[0.2, "a"] tm.assert_series_equal(r, e) + + def test_ix_intervalindex(self): + # https://github.com/pandas-dev/pandas/issues/27865 + df = DataFrame( + np.random.randn(5, 2), + index=pd.IntervalIndex.from_breaks([-np.inf, 0, 1, 2, 3, np.inf]), + ) + result = df.ix[0:2, 0] + expected = df.iloc[0:2, 0] + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index abe0cd86c90d7..9845b1ac3a4b9 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1070,6 +1070,16 @@ def test_series_indexing_zerodim_np_array(self): result = s.loc[np.array(0)] assert result == 1 + def test_loc_reverse_assignment(self): + # GH26939 + data = [1, 2, 3, 4, 5, 6] + [None] * 4 + expected = Series(data, index=range(2010, 2020)) + + result = pd.Series(index=range(2010, 2020)) + result.loc[2015:2010:-1] = [6, 5, 4, 3, 2, 1] + + tm.assert_series_equal(result, expected) + def test_series_loc_getitem_label_list_missing_values(): # gh-11428 diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index 68e93f06e43dc..c4505231932c6 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -442,10 +442,10 @@ def test_partial_set_empty_frame(self): # these work as they don't really change # anything but the index # GH5632 - expected = DataFrame(columns=["foo"], index=Index([], dtype="int64")) + expected = DataFrame(columns=["foo"], index=Index([], dtype="object")) def f(): - df = DataFrame() + df = DataFrame(index=Index([], dtype="object")) df["foo"] = Series([], dtype="object") return df @@ -469,22 +469,21 @@ def f(): expected["foo"] = expected["foo"].astype("float64") def f(): - df = DataFrame() + df = DataFrame(index=Index([], dtype="int64")) df["foo"] = [] return df tm.assert_frame_equal(f(), expected) def f(): - df = DataFrame() + df = DataFrame(index=Index([], dtype="int64")) df["foo"] = Series(np.arange(len(df)), dtype="float64") return df tm.assert_frame_equal(f(), expected) def f(): - df = DataFrame() - tm.assert_index_equal(df.index, Index([], dtype="object")) + df = DataFrame(index=Index([], dtype="int64")) df["foo"] = range(len(df)) return df diff --git a/pandas/tests/io/data/legacy_hdf/gh26443.h5 b/pandas/tests/io/data/legacy_hdf/gh26443.h5 new file mode 100644 index 0000000000000..45aa64324530f Binary files /dev/null and b/pandas/tests/io/data/legacy_hdf/gh26443.h5 differ diff --git a/pandas/tests/io/formats/data/html/html_repr_max_rows_10_min_rows_12.html b/pandas/tests/io/formats/data/html/html_repr_max_rows_10_min_rows_12.html new file mode 100644 index 0000000000000..4eb3f5319749d --- /dev/null +++ b/pandas/tests/io/formats/data/html/html_repr_max_rows_10_min_rows_12.html @@ -0,0 +1,70 @@ +
+ [HTML repr fixture: single-column table "a"; rows 0-4, "...", rows 56-60; footer "61 rows × 1 columns"]
diff --git a/pandas/tests/io/formats/data/html/html_repr_max_rows_10_min_rows_4.html b/pandas/tests/io/formats/data/html/html_repr_max_rows_10_min_rows_4.html new file mode 100644 index 0000000000000..2b1d97aec517c --- /dev/null +++ b/pandas/tests/io/formats/data/html/html_repr_max_rows_10_min_rows_4.html @@ -0,0 +1,46 @@ +
+ [HTML repr fixture: single-column table "a"; rows 0-1, "...", rows 59-60; footer "61 rows × 1 columns"]
diff --git a/pandas/tests/io/formats/data/html/html_repr_max_rows_12_min_rows_None.html b/pandas/tests/io/formats/data/html/html_repr_max_rows_12_min_rows_None.html new file mode 100644 index 0000000000000..a539e5a4884a1 --- /dev/null +++ b/pandas/tests/io/formats/data/html/html_repr_max_rows_12_min_rows_None.html @@ -0,0 +1,78 @@ +
+ [HTML repr fixture: single-column table "a"; rows 0-5, "...", rows 55-60; footer "61 rows × 1 columns"]
diff --git a/pandas/tests/io/formats/data/html/html_repr_max_rows_None_min_rows_12.html b/pandas/tests/io/formats/data/html/html_repr_max_rows_None_min_rows_12.html new file mode 100644 index 0000000000000..3e680a505c6d6 --- /dev/null +++ b/pandas/tests/io/formats/data/html/html_repr_max_rows_None_min_rows_12.html @@ -0,0 +1,269 @@ +
+ [HTML repr fixture: single-column table "a"; all rows 0-60 shown, no truncation marker, no footer]
diff --git a/pandas/tests/io/formats/data/html/html_repr_min_rows_default_no_truncation.html b/pandas/tests/io/formats/data/html/html_repr_min_rows_default_no_truncation.html new file mode 100644 index 0000000000000..10f6247e37def --- /dev/null +++ b/pandas/tests/io/formats/data/html/html_repr_min_rows_default_no_truncation.html @@ -0,0 +1,105 @@ +
+ [HTML repr fixture: single-column table "a"; rows 0-19 shown, no truncation marker, no footer]
diff --git a/pandas/tests/io/formats/data/html/html_repr_min_rows_default_truncated.html b/pandas/tests/io/formats/data/html/html_repr_min_rows_default_truncated.html new file mode 100644 index 0000000000000..4eb3f5319749d --- /dev/null +++ b/pandas/tests/io/formats/data/html/html_repr_min_rows_default_truncated.html @@ -0,0 +1,70 @@ +
+ [HTML repr fixture: single-column table "a"; rows 0-4, "...", rows 56-60; footer "61 rows × 1 columns"]
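The fixtures above record the HTML repr of a single-column frame under different display.max_rows / display.min_rows settings. A minimal sketch of the behaviour they capture, using only public options (the 61-row frame and option values mirror the tests that follow; variable names are illustrative):

import pandas as pd

df = pd.DataFrame({"a": range(61)})

with pd.option_context("display.max_rows", 10, "display.min_rows", 4):
    # truncated after the first two rows; footer reads "61 rows x 1 columns"
    html = df._repr_html_()

with pd.option_context("display.max_rows", None, "display.min_rows", 12):
    # max_rows=None means the repr is never truncated
    html = df._repr_html_()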
diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index a048e3bb867bd..c0451a0672c89 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -471,28 +471,35 @@ def test_repr_min_rows(self): # default setting no truncation even if above min_rows assert ".." not in repr(df) + assert ".." not in df._repr_html_() df = pd.DataFrame({"a": range(61)}) # default of max_rows 60 triggers truncation if above assert ".." in repr(df) + assert ".." in df._repr_html_() with option_context("display.max_rows", 10, "display.min_rows", 4): # truncated after first two rows assert ".." in repr(df) assert "2 " not in repr(df) + assert "..." in df._repr_html_() + assert "2" not in df._repr_html_() with option_context("display.max_rows", 12, "display.min_rows", None): # when set to None, follow value of max_rows assert "5 5" in repr(df) + assert "5" in df._repr_html_() with option_context("display.max_rows", 10, "display.min_rows", 12): # when set value higher as max_rows, use the minimum assert "5 5" not in repr(df) + assert "5" not in df._repr_html_() with option_context("display.max_rows", None, "display.min_rows", 12): # max_rows of None -> never truncate assert ".." not in repr(df) + assert ".." not in df._repr_html_() def test_str_max_colwidth(self): # GH 7856 diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index ee236a8253b01..ab44b8b8059eb 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -514,3 +514,44 @@ def test_to_csv_compression(self, compression_only, read_infer, to_infer): df.to_csv(path, compression=to_compression) result = pd.read_csv(path, index_col=0, compression=read_compression) tm.assert_frame_equal(result, df) + + def test_to_csv_compression_dict(self, compression_only): + # GH 26023 + method = compression_only + df = DataFrame({"ABC": [1]}) + filename = "to_csv_compress_as_dict." 
+ filename += "gz" if method == "gzip" else method + with tm.ensure_clean(filename) as path: + df.to_csv(path, compression={"method": method}) + read_df = pd.read_csv(path, index_col=0) + tm.assert_frame_equal(read_df, df) + + def test_to_csv_compression_dict_no_method_raises(self): + # GH 26023 + df = DataFrame({"ABC": [1]}) + compression = {"some_option": True} + msg = "must have key 'method'" + + with tm.ensure_clean("out.zip") as path: + with pytest.raises(ValueError, match=msg): + df.to_csv(path, compression=compression) + + @pytest.mark.parametrize("compression", ["zip", "infer"]) + @pytest.mark.parametrize( + "archive_name", [None, "test_to_csv.csv", "test_to_csv.zip"] + ) + def test_to_csv_zip_arguments(self, compression, archive_name): + # GH 26023 + from zipfile import ZipFile + + df = DataFrame({"ABC": [1]}) + with tm.ensure_clean("to_csv_archive_name.zip") as path: + df.to_csv( + path, compression={"method": compression, "archive_name": archive_name} + ) + zp = ZipFile(path) + expected_arcname = path if archive_name is None else archive_name + expected_arcname = os.path.basename(expected_arcname) + assert len(zp.filelist) == 1 + archived_file = os.path.basename(zp.filelist[0].filename) + assert archived_file == expected_arcname diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py index 448e869df950d..52c7b89220f06 100644 --- a/pandas/tests/io/formats/test_to_html.py +++ b/pandas/tests/io/formats/test_to_html.py @@ -713,3 +713,42 @@ def test_to_html_with_col_space_units(unit): for h in hdrs: expected = ''.format(unit=unit) assert expected in h + + +def test_html_repr_min_rows_default(datapath): + # gh-27991 + + # default setting no truncation even if above min_rows + df = pd.DataFrame({"a": range(20)}) + result = df._repr_html_() + expected = expected_html(datapath, "html_repr_min_rows_default_no_truncation") + assert result == expected + + # default of max_rows 60 triggers truncation if above + df = pd.DataFrame({"a": range(61)}) + result = df._repr_html_() + expected = expected_html(datapath, "html_repr_min_rows_default_truncated") + assert result == expected + + +@pytest.mark.parametrize( + "max_rows,min_rows,expected", + [ + # truncated after first two rows + (10, 4, "html_repr_max_rows_10_min_rows_4"), + # when set to None, follow value of max_rows + (12, None, "html_repr_max_rows_12_min_rows_None"), + # when set value higher as max_rows, use the minimum + (10, 12, "html_repr_max_rows_10_min_rows_12"), + # max_rows of None -> never truncate + (None, 12, "html_repr_max_rows_None_min_rows_12"), + ], +) +def test_html_repr_min_rows(datapath, max_rows, min_rows, expected): + # gh-27991 + + df = pd.DataFrame({"a": range(61)}) + expected = expected_html(datapath, expected) + with option_context("display.max_rows", max_rows, "display.min_rows", min_rows): + result = df._repr_html_() + assert result == expected diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 9c687f036aa68..9842a706f43d7 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1012,60 +1012,70 @@ def test_convert_dates_infer(self): result = read_json(dumps(data))[["id", infer_word]] assert_frame_equal(result, expected) - def test_date_format_frame(self): + @pytest.mark.parametrize( + "date,date_unit", + [ + ("20130101 20:43:42.123", None), + ("20130101 20:43:42", "s"), + ("20130101 20:43:42.123", "ms"), + ("20130101 20:43:42.123456", "us"), + ("20130101 20:43:42.123456789", "ns"), + ], + ) + 
def test_date_format_frame(self, date, date_unit): df = self.tsframe.copy() - def test_w_date(date, date_unit=None): - df["date"] = Timestamp(date) - df.iloc[1, df.columns.get_loc("date")] = pd.NaT - df.iloc[5, df.columns.get_loc("date")] = pd.NaT - if date_unit: - json = df.to_json(date_format="iso", date_unit=date_unit) - else: - json = df.to_json(date_format="iso") - result = read_json(json) - expected = df.copy() - expected.index = expected.index.tz_localize("UTC") - expected["date"] = expected["date"].dt.tz_localize("UTC") - assert_frame_equal(result, expected) - - test_w_date("20130101 20:43:42.123") - test_w_date("20130101 20:43:42", date_unit="s") - test_w_date("20130101 20:43:42.123", date_unit="ms") - test_w_date("20130101 20:43:42.123456", date_unit="us") - test_w_date("20130101 20:43:42.123456789", date_unit="ns") + df["date"] = Timestamp(date) + df.iloc[1, df.columns.get_loc("date")] = pd.NaT + df.iloc[5, df.columns.get_loc("date")] = pd.NaT + if date_unit: + json = df.to_json(date_format="iso", date_unit=date_unit) + else: + json = df.to_json(date_format="iso") + result = read_json(json) + expected = df.copy() + # expected.index = expected.index.tz_localize("UTC") + expected["date"] = expected["date"].dt.tz_localize("UTC") + assert_frame_equal(result, expected) + def test_date_format_frame_raises(self): + df = self.tsframe.copy() msg = "Invalid value 'foo' for option 'date_unit'" with pytest.raises(ValueError, match=msg): df.to_json(date_format="iso", date_unit="foo") - def test_date_format_series(self): - def test_w_date(date, date_unit=None): - ts = Series(Timestamp(date), index=self.ts.index) - ts.iloc[1] = pd.NaT - ts.iloc[5] = pd.NaT - if date_unit: - json = ts.to_json(date_format="iso", date_unit=date_unit) - else: - json = ts.to_json(date_format="iso") - result = read_json(json, typ="series") - expected = ts.copy() - expected.index = expected.index.tz_localize("UTC") - expected = expected.dt.tz_localize("UTC") - assert_series_equal(result, expected) - - test_w_date("20130101 20:43:42.123") - test_w_date("20130101 20:43:42", date_unit="s") - test_w_date("20130101 20:43:42.123", date_unit="ms") - test_w_date("20130101 20:43:42.123456", date_unit="us") - test_w_date("20130101 20:43:42.123456789", date_unit="ns") + @pytest.mark.parametrize( + "date,date_unit", + [ + ("20130101 20:43:42.123", None), + ("20130101 20:43:42", "s"), + ("20130101 20:43:42.123", "ms"), + ("20130101 20:43:42.123456", "us"), + ("20130101 20:43:42.123456789", "ns"), + ], + ) + def test_date_format_series(self, date, date_unit): + ts = Series(Timestamp(date), index=self.ts.index) + ts.iloc[1] = pd.NaT + ts.iloc[5] = pd.NaT + if date_unit: + json = ts.to_json(date_format="iso", date_unit=date_unit) + else: + json = ts.to_json(date_format="iso") + result = read_json(json, typ="series") + expected = ts.copy() + # expected.index = expected.index.tz_localize("UTC") + expected = expected.dt.tz_localize("UTC") + assert_series_equal(result, expected) + def test_date_format_series_raises(self): ts = Series(Timestamp("20130101 20:43:42.123"), index=self.ts.index) msg = "Invalid value 'foo' for option 'date_unit'" with pytest.raises(ValueError, match=msg): ts.to_json(date_format="iso", date_unit="foo") - def test_date_unit(self): + @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) + def test_date_unit(self, unit): df = self.tsframe.copy() df["date"] = Timestamp("20130101 20:43:42") dl = df.columns.get_loc("date") @@ -1073,16 +1083,15 @@ def test_date_unit(self): df.iloc[2, dl] = Timestamp("21460101 
20:43:42") df.iloc[4, dl] = pd.NaT - for unit in ("s", "ms", "us", "ns"): - json = df.to_json(date_format="epoch", date_unit=unit) + json = df.to_json(date_format="epoch", date_unit=unit) - # force date unit - result = read_json(json, date_unit=unit) - assert_frame_equal(result, df) + # force date unit + result = read_json(json, date_unit=unit) + assert_frame_equal(result, df) - # detect date unit - result = read_json(json, date_unit=None) - assert_frame_equal(result, df) + # detect date unit + result = read_json(json, date_unit=None) + assert_frame_equal(result, df) def test_weird_nested_json(self): # this used to core dump the parser @@ -1611,3 +1620,30 @@ def test_read_timezone_information(self): ) expected = Series([88], index=DatetimeIndex(["2019-01-01 11:00:00"], tz="UTC")) assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "date_format,key", [("epoch", 86400000), ("iso", "P1DT0H0M0S")] + ) + def test_timedelta_as_label(self, date_format, key): + df = pd.DataFrame([[1]], columns=[pd.Timedelta("1D")]) + expected = '{{"{key}":{{"0":1}}}}'.format(key=key) + result = df.to_json(date_format=date_format) + + assert result == expected + + @pytest.mark.parametrize( + "orient,expected", + [ + ("index", "{\"('a', 'b')\":{\"('c', 'd')\":1}}"), + ("columns", "{\"('c', 'd')\":{\"('a', 'b')\":1}}"), + # TODO: the below have separate encoding procedures + # They produce JSON but not in a consistent manner + pytest.param("split", "", marks=pytest.mark.skip), + pytest.param("table", "", marks=pytest.mark.skip), + ], + ) + def test_tuple_labels(self, orient, expected): + # GH 20500 + df = pd.DataFrame([[1]], index=[("a", "b")], columns=[("c", "d")]) + result = df.to_json(orient=orient) + assert result == expected diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index b94d5cd497ccf..e04535df56663 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -1898,7 +1898,10 @@ def test_null_byte_char(all_parsers): out = parser.read_csv(StringIO(data), names=names) tm.assert_frame_equal(out, expected) else: - msg = "NULL byte detected" + if compat.PY38: + msg = "line contains NUL" + else: + msg = "NULL byte detected" with pytest.raises(ParserError, match=msg): parser.read_csv(StringIO(data), names=names) @@ -2020,9 +2023,34 @@ def test_file_handles_with_open(all_parsers, csv1): # Don't close user provided file handles. parser = all_parsers - with open(csv1, "r") as f: - parser.read_csv(f) - assert not f.closed + for mode in ["r", "rb"]: + with open(csv1, mode) as f: + parser.read_csv(f) + assert not f.closed + + +@pytest.mark.parametrize( + "fname,encoding", + [ + ("test1.csv", "utf-8"), + ("unicode_series.csv", "latin-1"), + ("sauron.SHIFT_JIS.csv", "shiftjis"), + ], +) +def test_binary_mode_file_buffers(all_parsers, csv_dir_path, fname, encoding): + # gh-23779: Python csv engine shouldn't error on files opened in binary. 
+ parser = all_parsers + + fpath = os.path.join(csv_dir_path, fname) + expected = parser.read_csv(fpath, encoding=encoding) + + with open(fpath, mode="r", encoding=encoding) as fa: + result = parser.read_csv(fa) + tm.assert_frame_equal(expected, result) + + with open(fpath, mode="rb") as fb: + result = parser.read_csv(fb, encoding=encoding) + tm.assert_frame_equal(expected, result) def test_invalid_file_buffer_class(all_parsers): diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index 99e0181741998..0ecd8be7ddc78 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -24,6 +24,35 @@ def test_read_with_bad_header(all_parsers): parser.read_csv(s, header=[10]) +def test_negative_header(all_parsers): + # see gh-27779 + parser = all_parsers + data = """1,2,3,4,5 +6,7,8,9,10 +11,12,13,14,15 +""" + with pytest.raises( + ValueError, + match="Passing negative integer to header is invalid. " + "For no header, use header=None instead", + ): + parser.read_csv(StringIO(data), header=-1) + + +@pytest.mark.parametrize("header", [([-1, 2, 4]), ([-5, 0])]) +def test_negative_multi_index_header(all_parsers, header): + # see gh-27779 + parser = all_parsers + data = """1,2,3,4,5 + 6,7,8,9,10 + 11,12,13,14,15 + """ + with pytest.raises( + ValueError, match="cannot specify multi-index header with negative integers" + ): + parser.read_csv(StringIO(data), header=header) + + @pytest.mark.parametrize("header", [True, False]) def test_bool_header_arg(all_parsers, header): # see gh-6114 diff --git a/pandas/tests/io/pytables/test_pytables.py b/pandas/tests/io/pytables/test_pytables.py index d67f2c3b7bd66..77cac00882771 100644 --- a/pandas/tests/io/pytables/test_pytables.py +++ b/pandas/tests/io/pytables/test_pytables.py @@ -37,7 +37,6 @@ import pandas.util.testing as tm from pandas.util.testing import assert_frame_equal, assert_series_equal, set_timezone -from pandas.io import pytables as pytables # noqa:E402 from pandas.io.formats.printing import pprint_thing from pandas.io.pytables import ( ClosedFileError, @@ -46,7 +45,9 @@ Term, read_hdf, ) -from pandas.io.pytables import TableIterator # noqa:E402 + +from pandas.io import pytables as pytables # noqa: E402 isort:skip +from pandas.io.pytables import TableIterator # noqa: E402 isort:skip tables = pytest.importorskip("tables") @@ -5446,3 +5447,16 @@ def test_read_with_where_tz_aware_index(self): store.append(key, expected, format="table", append=True) result = pd.read_hdf(path, key, where="DATE > 20151130") assert_frame_equal(result, expected) + + def test_py2_created_with_datetimez(self, datapath): + # The test HDF5 file was created in Python 2, but could not be read in + # Python 3. 
+ # + # GH26443 + index = [pd.Timestamp("2019-01-01T18:00").tz_localize("America/New_York")] + expected = DataFrame({"data": 123}, index=index) + with ensure_clean_store( + datapath("io", "data", "legacy_hdf", "gh26443.h5"), mode="r" + ) as store: + result = store["key"] + assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index ce459ab24afe0..d68b6a1effaa0 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -1,5 +1,8 @@ import contextlib import os +import subprocess +import sys +import textwrap import warnings import pytest @@ -125,3 +128,33 @@ def test_compression_warning(compression_only): with tm.assert_produces_warning(RuntimeWarning, check_stacklevel=False): with f: df.to_csv(f, compression=compression_only) + + +def test_with_missing_lzma(): + """Tests if import pandas works when lzma is not present.""" + # https://github.com/pandas-dev/pandas/issues/27575 + code = textwrap.dedent( + """\ + import sys + sys.modules['lzma'] = None + import pandas + """ + ) + subprocess.check_output([sys.executable, "-c", code]) + + +def test_with_missing_lzma_runtime(): + """Tests if RuntimeError is hit when calling lzma without + having the module available.""" + code = textwrap.dedent( + """ + import sys + import pytest + sys.modules['lzma'] = None + import pandas + df = pandas.DataFrame() + with pytest.raises(RuntimeError, match='lzma module'): + df.to_csv('foo.csv', compression='xz') + """ + ) + subprocess.check_output([sys.executable, "-c", code]) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 87a2405a10dd5..ee668d6890756 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -8,7 +8,7 @@ import pandas.util.testing as tm from pandas.util.testing import assert_frame_equal, ensure_clean -from pandas.io.feather_format import read_feather, to_feather # noqa:E402 +from pandas.io.feather_format import read_feather, to_feather # noqa: E402 isort:skip pyarrow = pytest.importorskip("pyarrow") diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 076d0c9f947c7..30555508f0998 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -13,7 +13,6 @@ import bz2 import glob import gzip -import lzma import os import pickle import shutil @@ -22,7 +21,7 @@ import pytest -from pandas.compat import is_platform_little_endian +from pandas.compat import _get_lzma_file, _import_lzma, is_platform_little_endian import pandas as pd from pandas import Index @@ -30,6 +29,8 @@ from pandas.tseries.offsets import Day, MonthEnd +lzma = _import_lzma() + @pytest.fixture(scope="module") def current_pickle_data(): @@ -270,7 +271,7 @@ def compress_file(self, src_path, dest_path, compression): with zipfile.ZipFile(dest_path, "w", compression=zipfile.ZIP_DEFLATED) as f: f.write(src_path, os.path.basename(src_path)) elif compression == "xz": - f = lzma.LZMAFile(dest_path, "w") + f = _get_lzma_file(lzma)(dest_path, "w") else: msg = "Unrecognized compression type: {}".format(compression) raise ValueError(msg) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index d8465a427eaea..25727447b4c6f 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -565,7 +565,6 @@ def _transaction_test(self): class _TestSQLApi(PandasSQLTest): - """ Base class to test the public API. 
diff --git a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py index 4929422d20e8a..5a591f72d7361 100644 --- a/pandas/tests/plotting/common.py +++ b/pandas/tests/plotting/common.py @@ -103,6 +103,28 @@ def _check_legend_labels(self, axes, labels=None, visible=True): else: assert ax.get_legend() is None + def _check_legend_marker(self, ax, expected_markers=None, visible=True): + """ + Check ax has expected legend markers + + Parameters + ---------- + ax : matplotlib Axes object + expected_markers : list-like + expected legend markers + visible : bool + expected legend visibility. labels are checked only when visible is + True + """ + if visible and (expected_markers is None): + raise ValueError("Markers must be specified when visible is True") + if visible: + handles, _ = ax.get_legend_handles_labels() + markers = [handle.get_marker() for handle in handles] + assert markers == expected_markers + else: + assert ax.get_legend() is None + def _check_data(self, xp, rs): """ Check each axes has identical lines diff --git a/pandas/tests/plotting/test_backend.py b/pandas/tests/plotting/test_backend.py index e79e7b6239eb3..d126407cfd823 100644 --- a/pandas/tests/plotting/test_backend.py +++ b/pandas/tests/plotting/test_backend.py @@ -46,14 +46,18 @@ def test_backend_is_correct(monkeypatch): @td.skip_if_no_mpl def test_register_entrypoint(): + + dist = pkg_resources.get_distribution("pandas") + if dist.module_path not in pandas.__file__: + # We are running from a non-installed pandas, and this test is invalid + pytest.skip("Testing a non-installed pandas") + mod = types.ModuleType("my_backend") mod.plot = lambda *args, **kwargs: 1 backends = pkg_resources.get_entry_map("pandas") my_entrypoint = pkg_resources.EntryPoint( - "pandas_plotting_backend", - mod.__name__, - dist=pkg_resources.get_distribution("pandas"), + "pandas_plotting_backend", mod.__name__, dist=dist ) backends["pandas_plotting_backends"]["my_backend"] = my_entrypoint # TODO: the docs recommend importlib.util.module_from_spec. But this works for now. 
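The _check_legend_marker helper added above inspects legend markers through matplotlib's handle API. A minimal usage sketch outside the test base class (the frame and marker choices are illustrative, not taken from the suite):

import pandas as pd

df = pd.DataFrame({"x": [1, 2, 3], "r": [1, 1, 1], "g": [2, 4, 8]})
ax = df.plot(x="x", y="r", marker="o", linewidth=0)
df.plot(x="x", y="g", marker="x", ax=ax)

# collect the markers attached to the legend handles
handles, _ = ax.get_legend_handles_labels()
markers = [handle.get_marker() for handle in handles]  # ["o", "x"] with the GH 18222 fix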
diff --git a/pandas/tests/plotting/test_boxplot_method.py b/pandas/tests/plotting/test_boxplot_method.py index cab0efe53f1fc..5bbaff580c356 100644 --- a/pandas/tests/plotting/test_boxplot_method.py +++ b/pandas/tests/plotting/test_boxplot_method.py @@ -9,7 +9,7 @@ import pandas.util._test_decorators as td -from pandas import DataFrame, MultiIndex, Series +from pandas import DataFrame, MultiIndex, Series, date_range, timedelta_range from pandas.tests.plotting.common import TestPlotBase, _check_plot_works import pandas.util.testing as tm @@ -160,6 +160,21 @@ def test_fontsize(self): df.boxplot("a", fontsize=16), xlabelsize=16, ylabelsize=16 ) + def test_boxplot_numeric_data(self): + # GH 22799 + df = DataFrame( + { + "a": date_range("2012-01-01", periods=100), + "b": np.random.randn(100), + "c": np.random.randn(100) + 2, + "d": date_range("2012-01-01", periods=100).astype(str), + "e": date_range("2012-01-01", periods=100, tz="UTC"), + "f": timedelta_range("1 days", periods=100), + } + ) + ax = df.plot(kind="box") + assert [x.get_text() for x in ax.get_xticklabels()] == ["b", "c"] + @td.skip_if_no_mpl class TestDataFrameGroupByPlots(TestPlotBase): diff --git a/pandas/tests/plotting/test_converter.py b/pandas/tests/plotting/test_converter.py index 35d12706f0590..7001264c41c05 100644 --- a/pandas/tests/plotting/test_converter.py +++ b/pandas/tests/plotting/test_converter.py @@ -40,6 +40,21 @@ def test_initial_warning(): assert "Using an implicitly" in out +def test_registry_mpl_resets(): + # Check that Matplotlib converters are properly reset (see issue #27481) + code = ( + "import matplotlib.units as units; " + "import matplotlib.dates as mdates; " + "n_conv = len(units.registry); " + "import pandas as pd; " + "pd.plotting.register_matplotlib_converters(); " + "pd.plotting.deregister_matplotlib_converters(); " + "assert len(units.registry) == n_conv" + ) + call = [sys.executable, "-c", code] + subprocess.check_output(call) + + def test_timtetonum_accepts_unicode(): assert converter.time2num("00:01") == converter.time2num("00:01") diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index 69070ea11e478..e2b7f2819f957 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -1410,7 +1410,7 @@ def test_plot_outofbounds_datetime(self): def test_format_timedelta_ticks_narrow(self): - expected_labels = ["00:00:00.0000000{:0>2d}".format(i) for i in range(10)] + expected_labels = ["00:00:00.0000000{:0>2d}".format(i) for i in np.arange(10)] rng = timedelta_range("0", periods=10, freq="ns") df = DataFrame(np.random.randn(len(rng), 3), rng) diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index 65815bcedebfc..f672cd3a6aa58 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -1881,6 +1881,31 @@ def test_df_legend_labels(self): self._check_legend_labels(ax, labels=["LABEL_b", "LABEL_c"]) assert df5.columns.tolist() == ["b", "c"] + def test_missing_marker_multi_plots_on_same_ax(self): + # GH 18222 + df = pd.DataFrame( + data=[[1, 1, 1, 1], [2, 2, 4, 8]], columns=["x", "r", "g", "b"] + ) + fig, ax = self.plt.subplots(nrows=1, ncols=3) + # Left plot + df.plot(x="x", y="r", linewidth=0, marker="o", color="r", ax=ax[0]) + df.plot(x="x", y="g", linewidth=1, marker="x", color="g", ax=ax[0]) + df.plot(x="x", y="b", linewidth=1, marker="o", color="b", ax=ax[0]) + self._check_legend_labels(ax[0], labels=["r", "g", "b"]) + 
self._check_legend_marker(ax[0], expected_markers=["o", "x", "o"]) + # Center plot + df.plot(x="x", y="b", linewidth=1, marker="o", color="b", ax=ax[1]) + df.plot(x="x", y="r", linewidth=0, marker="o", color="r", ax=ax[1]) + df.plot(x="x", y="g", linewidth=1, marker="x", color="g", ax=ax[1]) + self._check_legend_labels(ax[1], labels=["b", "r", "g"]) + self._check_legend_marker(ax[1], expected_markers=["o", "o", "x"]) + # Right plot + df.plot(x="x", y="g", linewidth=1, marker="x", color="g", ax=ax[2]) + df.plot(x="x", y="b", linewidth=1, marker="o", color="b", ax=ax[2]) + df.plot(x="x", y="r", linewidth=0, marker="o", color="r", ax=ax[2]) + self._check_legend_labels(ax[2], labels=["g", "b", "r"]) + self._check_legend_marker(ax[2], expected_markers=["x", "o", "o"]) + def test_legend_name(self): multi = DataFrame( randn(4, 4), @@ -3152,6 +3177,58 @@ def test_x_multiindex_values_ticks(self): assert labels_position["(2013, 1)"] == 2.0 assert labels_position["(2013, 2)"] == 3.0 + @pytest.mark.parametrize("kind", ["line", "area"]) + def test_xlim_plot_line(self, kind): + # test if xlim is set correctly in plot.line and plot.area + # GH 27686 + df = pd.DataFrame([2, 4], index=[1, 2]) + ax = df.plot(kind=kind) + xlims = ax.get_xlim() + assert xlims[0] < 1 + assert xlims[1] > 2 + + def test_xlim_plot_line_correctly_in_mixed_plot_type(self): + # test if xlim is set correctly when ax contains multiple different kinds + # of plots, GH 27686 + fig, ax = self.plt.subplots() + + indexes = ["k1", "k2", "k3", "k4"] + df = pd.DataFrame( + { + "s1": [1000, 2000, 1500, 2000], + "s2": [900, 1400, 2000, 3000], + "s3": [1500, 1500, 1600, 1200], + "secondary_y": [1, 3, 4, 3], + }, + index=indexes, + ) + df[["s1", "s2", "s3"]].plot.bar(ax=ax, stacked=False) + df[["secondary_y"]].plot(ax=ax, secondary_y=True) + + xlims = ax.get_xlim() + assert xlims[0] < 0 + assert xlims[1] > 3 + + # make sure axis labels are plotted correctly as well + xticklabels = [t.get_text() for t in ax.get_xticklabels()] + assert xticklabels == indexes + + def test_subplots_sharex_false(self): + # test when sharex is set to False, two plots should have different + # labels, GH 25160 + df = pd.DataFrame(np.random.rand(10, 2)) + df.iloc[5:, 1] = np.nan + df.iloc[:5, 0] = np.nan + + figs, axs = self.plt.subplots(2, 1) + df.plot.line(ax=axs, subplots=True, sharex=False) + + expected_ax1 = np.arange(4.5, 10, 0.5) + expected_ax2 = np.arange(-0.5, 5, 0.5) + + tm.assert_numpy_array_equal(axs[0].get_xticks(), expected_ax1) + tm.assert_numpy_array_equal(axs[1].get_xticks(), expected_ax2) + def _generate_4_axes_via_gridspec(): import matplotlib.pyplot as plt diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 8b4a78e9195b5..2c4c8aa7461a3 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -167,6 +167,15 @@ def test_label(self): ax.legend() # draw it self._check_legend_labels(ax, labels=["LABEL"]) + def test_boolean(self): + # GH 23719 + s = Series([False, False, True]) + _check_plot_works(s.plot, include_bool=True) + + msg = "no numeric data to plot" + with pytest.raises(TypeError, match=msg): + _check_plot_works(s.plot) + def test_line_area_nan_series(self): values = [1, 2, np.nan, 3] s = Series(values) @@ -888,3 +897,15 @@ def test_plot_accessor_updates_on_inplace(self): _, ax = self.plt.subplots() after = ax.xaxis.get_ticklocs() tm.assert_numpy_array_equal(before, after) + + @pytest.mark.parametrize("kind", ["line", "area"]) + def test_plot_xlim_for_series(self, 
kind): + # test if xlim is also correctly plotted in Series for line and area + # GH 27686 + s = Series([2, 3]) + _, ax = self.plt.subplots() + s.plot(kind=kind, ax=ax) + xlims = ax.get_xlim() + + assert xlims[0] < 0 + assert xlims[1] > 1 diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index b6c6f967333a8..a04f093ee7818 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1340,6 +1340,18 @@ def test_merge_take_missing_values_from_index_of_other_dtype(self): expected = expected.reindex(columns=["a", "key", "b"]) tm.assert_frame_equal(result, expected) + def test_merge_readonly(self): + # https://github.com/pandas-dev/pandas/issues/27943 + data1 = pd.DataFrame( + np.arange(20).reshape((4, 5)) + 1, columns=["a", "b", "c", "d", "e"] + ) + data2 = pd.DataFrame( + np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"] + ) + + data1._data.blocks[0].values.flags.writeable = False + data1.merge(data2) # no error + def _check_merge(x, y): for how in ["inner", "left", "outer"]: diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index 6b66386bafc5e..7412b1de643a1 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -1,3 +1,5 @@ +import datetime + import numpy as np import pytest import pytz @@ -588,14 +590,23 @@ def test_non_sorted(self): # ok, though has dupes merge_asof(trades, self.quotes, on="time", by="ticker") - def test_tolerance(self): + @pytest.mark.parametrize( + "tolerance", + [ + Timedelta("1day"), + pytest.param( + datetime.timedelta(days=1), + marks=pytest.mark.xfail(reason="not implemented", strict=True), + ), + ], + ids=["pd.Timedelta", "datetime.timedelta"], + ) + def test_tolerance(self, tolerance): trades = self.trades quotes = self.quotes - result = merge_asof( - trades, quotes, on="time", by="ticker", tolerance=Timedelta("1day") - ) + result = merge_asof(trades, quotes, on="time", by="ticker", tolerance=tolerance) expected = self.tolerance assert_frame_equal(result, expected) @@ -1246,3 +1257,39 @@ def test_by_mixed_tz_aware(self): ) expected["value_y"] = np.array([np.nan], dtype=object) assert_frame_equal(result, expected) + + def test_timedelta_tolerance_nearest(self): + # GH 27642 + + left = pd.DataFrame( + list(zip([0, 5, 10, 15, 20, 25], [0, 1, 2, 3, 4, 5])), + columns=["time", "left"], + ) + + left["time"] = pd.to_timedelta(left["time"], "ms") + + right = pd.DataFrame( + list(zip([0, 3, 9, 12, 15, 18], [0, 1, 2, 3, 4, 5])), + columns=["time", "right"], + ) + + right["time"] = pd.to_timedelta(right["time"], "ms") + + expected = pd.DataFrame( + list( + zip( + [0, 5, 10, 15, 20, 25], + [0, 1, 2, 3, 4, 5], + [0, np.nan, 2, 4, np.nan, np.nan], + ) + ), + columns=["time", "left", "right"], + ) + + expected["time"] = pd.to_timedelta(expected["time"], "ms") + + result = pd.merge_asof( + left, right, on="time", tolerance=Timedelta("1ms"), direction="nearest" + ) + + assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 6366bf0521fbc..13f0f14014a31 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -50,7 +50,6 @@ def sort_with_none(request): class TestConcatAppendCommon: - """ Test common dtype coercion rules between concat and append. 
""" diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index be82e7f595f8c..03b15d2df1a26 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2447,3 +2447,84 @@ def test_crosstab_unsorted_order(self): [[1, 0, 0], [0, 1, 0], [0, 0, 1]], index=e_idx, columns=e_columns ) tm.assert_frame_equal(result, expected) + + def test_margin_normalize(self): + # GH 27500 + df = pd.DataFrame( + { + "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"], + "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"], + "C": [ + "small", + "large", + "large", + "small", + "small", + "large", + "small", + "small", + "large", + ], + "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], + "E": [2, 4, 5, 5, 6, 6, 8, 9, 9], + } + ) + # normalize on index + result = pd.crosstab( + [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=0 + ) + expected = pd.DataFrame( + [[0.5, 0.5], [0.5, 0.5], [0.666667, 0.333333], [0, 1], [0.444444, 0.555556]] + ) + expected.index = MultiIndex( + levels=[["Sub-Total", "bar", "foo"], ["", "one", "two"]], + codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]], + names=["A", "B"], + ) + expected.columns = Index(["large", "small"], dtype="object", name="C") + tm.assert_frame_equal(result, expected) + + # normalize on columns + result = pd.crosstab( + [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=1 + ) + expected = pd.DataFrame( + [ + [0.25, 0.2, 0.222222], + [0.25, 0.2, 0.222222], + [0.5, 0.2, 0.333333], + [0, 0.4, 0.222222], + ] + ) + expected.columns = Index( + ["large", "small", "Sub-Total"], dtype="object", name="C" + ) + expected.index = MultiIndex( + levels=[["bar", "foo"], ["one", "two"]], + codes=[[0, 0, 1, 1], [0, 1, 0, 1]], + names=["A", "B"], + ) + tm.assert_frame_equal(result, expected) + + # normalize on both index and column + result = pd.crosstab( + [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=True + ) + expected = pd.DataFrame( + [ + [0.111111, 0.111111, 0.222222], + [0.111111, 0.111111, 0.222222], + [0.222222, 0.111111, 0.333333], + [0.000000, 0.222222, 0.222222], + [0.444444, 0.555555, 1], + ] + ) + expected.columns = Index( + ["large", "small", "Sub-Total"], dtype="object", name="C" + ) + expected.index = MultiIndex( + levels=[["Sub-Total", "bar", "foo"], ["", "one", "two"]], + codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]], + names=["A", "B"], + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/scalar/period/test_asfreq.py b/pandas/tests/scalar/period/test_asfreq.py index 4cff061cabc40..357274e724c68 100644 --- a/pandas/tests/scalar/period/test_asfreq.py +++ b/pandas/tests/scalar/period/test_asfreq.py @@ -30,11 +30,8 @@ def test_asfreq_near_zero_weekly(self): assert week1.asfreq("D", "E") >= per1 assert week2.asfreq("D", "S") <= per2 - @pytest.mark.xfail( - reason="GH#19643 period_helper asfreq functions fail to check for overflows" - ) def test_to_timestamp_out_of_bounds(self): - # GH#19643, currently gives Timestamp('1754-08-30 22:43:41.128654848') + # GH#19643, used to incorrectly give Timestamp in 1754 per = Period("0001-01-01", freq="B") with pytest.raises(OutOfBoundsDatetime): per.to_timestamp() diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index b57b817461788..a1de205afc0e2 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -1298,23 +1298,13 @@ def test_add_offset_nat(self): timedelta(365), ]: assert p + o is 
NaT - - if isinstance(o, np.timedelta64): - with pytest.raises(TypeError): - o + p - else: - assert o + p is NaT + assert o + p is NaT for freq in ["M", "2M", "3M"]: p = Period("NaT", freq=freq) for o in [offsets.MonthEnd(2), offsets.MonthEnd(12)]: assert p + o is NaT - - if isinstance(o, np.timedelta64): - with pytest.raises(TypeError): - o + p - else: - assert o + p is NaT + assert o + p is NaT for o in [ offsets.YearBegin(2), @@ -1324,12 +1314,7 @@ def test_add_offset_nat(self): timedelta(365), ]: assert p + o is NaT - - if isinstance(o, np.timedelta64): - with pytest.raises(TypeError): - o + p - else: - assert o + p is NaT + assert o + p is NaT # freq is Tick for freq in ["D", "2D", "3D"]: @@ -1343,12 +1328,7 @@ def test_add_offset_nat(self): timedelta(hours=48), ]: assert p + o is NaT - - if isinstance(o, np.timedelta64): - with pytest.raises(TypeError): - o + p - else: - assert o + p is NaT + assert o + p is NaT for o in [ offsets.YearBegin(2), @@ -1358,12 +1338,7 @@ def test_add_offset_nat(self): timedelta(hours=23), ]: assert p + o is NaT - - if isinstance(o, np.timedelta64): - with pytest.raises(TypeError): - o + p - else: - assert o + p is NaT + assert o + p is NaT for freq in ["H", "2H", "3H"]: p = Period("NaT", freq=freq) @@ -1376,9 +1351,7 @@ def test_add_offset_nat(self): timedelta(days=4, minutes=180), ]: assert p + o is NaT - - if not isinstance(o, np.timedelta64): - assert o + p is NaT + assert o + p is NaT for o in [ offsets.YearBegin(2), @@ -1388,12 +1361,7 @@ def test_add_offset_nat(self): timedelta(hours=23, minutes=30), ]: assert p + o is NaT - - if isinstance(o, np.timedelta64): - with pytest.raises(TypeError): - o + p - else: - assert o + p is NaT + assert o + p is NaT def test_sub_offset(self): # freq is DateOffset @@ -1581,7 +1549,11 @@ def test_period_immutable(): @pytest.mark.xfail( - PY35, reason="Parsing as Period('0007-01-01', 'D') for reasons unknown", strict=True + # xpassing on MacPython with strict=False + # https://travis-ci.org/MacPython/pandas-wheels/jobs/574706922 + PY35, + reason="Parsing as Period('0007-01-01', 'D') for reasons unknown", + strict=False, ) def test_small_year_parsing(): per1 = Period("0001-01-07", "D") diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py index e7ad76cf95ba0..5eb69fb2952dc 100644 --- a/pandas/tests/scalar/test_nat.py +++ b/pandas/tests/scalar/test_nat.py @@ -1,4 +1,5 @@ from datetime import datetime, timedelta +import operator import numpy as np import pytest @@ -21,6 +22,7 @@ isna, ) from pandas.core.arrays import DatetimeArray, PeriodArray, TimedeltaArray +from pandas.core.ops import roperator from pandas.util import testing as tm @@ -250,6 +252,7 @@ def _get_overlap_public_nat_methods(klass, as_tuple=False): "day_name", "dst", "floor", + "fromisocalendar", "fromisoformat", "fromordinal", "fromtimestamp", @@ -294,6 +297,8 @@ def test_overlap_public_nat_methods(klass, expected): # "fromisoformat" was introduced in 3.7 if klass is Timestamp and not compat.PY37: expected.remove("fromisoformat") + if klass is Timestamp and not compat.PY38: + expected.remove("fromisocalendar") assert _get_overlap_public_nat_methods(klass) == expected @@ -333,8 +338,9 @@ def test_nat_doc_strings(compare): "value,val_type", [ (2, "scalar"), - (1.5, "scalar"), - (np.nan, "scalar"), + (1.5, "floating"), + (np.nan, "floating"), + ("foo", "str"), (timedelta(3600), "timedelta"), (Timedelta("5s"), "timedelta"), (datetime(2014, 1, 1), "timestamp"), @@ -348,6 +354,14 @@ def test_nat_arithmetic_scalar(op_name, value, 
val_type): # see gh-6873 invalid_ops = { "scalar": {"right_div_left"}, + "floating": { + "right_div_left", + "left_minus_right", + "right_minus_left", + "left_plus_right", + "right_plus_left", + }, + "str": set(_ops.keys()), "timedelta": {"left_times_right", "right_times_left"}, "timestamp": { "left_times_right", @@ -366,6 +380,16 @@ def test_nat_arithmetic_scalar(op_name, value, val_type): and isinstance(value, Timedelta) ): msg = "Cannot multiply" + elif val_type == "str": + # un-specific check here because the message comes from str + # and varies by method + msg = ( + "can only concatenate str|" + "unsupported operand type|" + "can't multiply sequence|" + "Can't convert 'NaTType'|" + "must be str, not NaTType" + ) else: msg = "unsupported operand type" @@ -435,6 +459,28 @@ def test_nat_arithmetic_td64_vector(op_name, box): tm.assert_equal(_ops[op_name](vec, NaT), box_nat) +@pytest.mark.parametrize( + "dtype,op,out_dtype", + [ + ("datetime64[ns]", operator.add, "datetime64[ns]"), + ("datetime64[ns]", roperator.radd, "datetime64[ns]"), + ("datetime64[ns]", operator.sub, "timedelta64[ns]"), + ("datetime64[ns]", roperator.rsub, "timedelta64[ns]"), + ("timedelta64[ns]", operator.add, "datetime64[ns]"), + ("timedelta64[ns]", roperator.radd, "datetime64[ns]"), + ("timedelta64[ns]", operator.sub, "datetime64[ns]"), + ("timedelta64[ns]", roperator.rsub, "timedelta64[ns]"), + ], +) +def test_nat_arithmetic_ndarray(dtype, op, out_dtype): + other = np.arange(10).astype(dtype) + result = op(NaT, other) + + expected = np.empty(other.shape, dtype=out_dtype) + expected.fill("NaT") + tm.assert_numpy_array_equal(result, expected) + + def test_nat_pinned_docstrings(): # see gh-17327 assert NaT.ctime.__doc__ == datetime.ctime.__doc__ diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py index 401fc285424fe..652dd34ca7ce2 100644 --- a/pandas/tests/scalar/timestamp/test_timestamp.py +++ b/pandas/tests/scalar/timestamp/test_timestamp.py @@ -1047,3 +1047,23 @@ def test_to_numpy_alias(self): # GH 24653: alias .to_numpy() for scalars ts = Timestamp(datetime.now()) assert ts.to_datetime64() == ts.to_numpy() + + +class SubDatetime(datetime): + pass + + +@pytest.mark.parametrize( + "lh,rh", + [ + (SubDatetime(2000, 1, 1), Timedelta(hours=1)), + (Timedelta(hours=1), SubDatetime(2000, 1, 1)), + ], +) +def test_dt_subclass_add_timedelta(lh, rh): + # GH#25851 + # ensure that subclassed datetime works for + # Timedelta operations + result = lh + rh + expected = SubDatetime(2000, 1, 1, 1) + assert result == expected diff --git a/pandas/tests/series/test_alter_axes.py b/pandas/tests/series/test_alter_axes.py index 0a25d6ba203cb..5d74ad95be90d 100644 --- a/pandas/tests/series/test_alter_axes.py +++ b/pandas/tests/series/test_alter_axes.py @@ -267,6 +267,25 @@ def test_rename_axis_none(self, kwargs): expected = Series([1, 2, 3], index=expected_index) tm.assert_series_equal(result, expected) + def test_rename_with_custom_indexer(self): + # GH 27814 + class MyIndexer: + pass + + ix = MyIndexer() + s = Series([1, 2, 3]).rename(ix) + assert s.name is ix + + def test_rename_with_custom_indexer_inplace(self): + # GH 27814 + class MyIndexer: + pass + + ix = MyIndexer() + s = Series([1, 2, 3]) + s.rename(ix, inplace=True) + assert s.name is ix + def test_set_axis_inplace_axes(self, axis_series): # GH14636 ser = Series(np.arange(4), index=[1, 3, 5, 7], dtype="int64") diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 
3a5a387b919be..d6cb7f8d6a8be 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -20,6 +20,7 @@ from pandas.api.types import is_scalar from pandas.core.index import MultiIndex from pandas.core.indexes.datetimes import Timestamp +from pandas.core.indexes.timedeltas import TimedeltaIndex import pandas.util.testing as tm from pandas.util.testing import ( assert_almost_equal, @@ -237,6 +238,59 @@ def test_npdiff(self): r = np.diff(s) assert_series_equal(Series([nan, 0, 0, 0, nan]), r) + def test_dt_nm_bool_diff(self): + # Combined datetime diff, normal diff and boolean diff test + ts = tm.makeTimeSeries(name="ts") + ts.diff() + + # int dtype + a = 10000000000000000 + b = a + 1 + s = Series([a, b]) + + rs = s.diff() + assert rs[1] == 1 + + # neg n + rs = ts.diff(-1) + xp = ts - ts.shift(-1) + assert_series_equal(rs, xp) + + # 0 + rs = ts.diff(0) + xp = ts - ts + assert_series_equal(rs, xp) + + # datetime diff (GH3100) + s = Series(date_range("20130102", periods=5)) + rs = s - s.shift(1) + xp = s.diff() + assert_series_equal(rs, xp) + + # timedelta diff + nrs = rs - rs.shift(1) + nxp = xp.diff() + assert_series_equal(nrs, nxp) + + # with tz + s = Series( + date_range("2000-01-01 09:00:00", periods=5, tz="US/Eastern"), name="foo" + ) + result = s.diff() + assert_series_equal( + result, Series(TimedeltaIndex(["NaT"] + ["1 days"] * 4), name="foo") + ) + + # boolean series + s = Series([False, True, True, False, False]) + result = s.diff() + assert_series_equal(result, Series([nan, True, False, True, False])) + + # boolean nan series + s = Series([False, True, nan, False, False]) + result = s.diff() + assert_series_equal(result, Series([nan, 1, nan, nan, 0], dtype="object")) + def _check_accum_op(self, name, datetime_series_, check_dtype=True): func = getattr(np, name) tm.assert_numpy_array_equal( @@ -1482,16 +1536,7 @@ def test_value_counts_with_nan(self): @pytest.mark.parametrize( "dtype", - [ - "int_", - "uint", - "float_", - "unicode_", - "timedelta64[h]", - pytest.param( - "datetime64[D]", marks=pytest.mark.xfail(reason="GH#7996", strict=True) - ), - ], + ["int_", "uint", "float_", "unicode_", "timedelta64[h]", "datetime64[D]"], ) def test_drop_duplicates_categorical_non_bool(self, dtype, ordered_fixture): cat_array = np.array([1, 2, 3, 4, 5], dtype=np.dtype(dtype)) @@ -1499,6 +1544,10 @@ def test_drop_duplicates_categorical_non_bool(self, dtype, ordered_fixture): # Test case 1 input1 = np.array([1, 2, 3, 3], dtype=np.dtype(dtype)) tc1 = Series(Categorical(input1, categories=cat_array, ordered=ordered_fixture)) + if dtype == "datetime64[D]": + # pre-empty flaky xfail, tc1 values are seemingly-random + if not (np.array(tc1) == input1).all(): + pytest.xfail(reason="GH#7996") expected = Series([False, False, False, True]) tm.assert_series_equal(tc1.duplicated(), expected) @@ -1524,6 +1573,10 @@ def test_drop_duplicates_categorical_non_bool(self, dtype, ordered_fixture): # Test case 2 input2 = np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(dtype)) tc2 = Series(Categorical(input2, categories=cat_array, ordered=ordered_fixture)) + if dtype == "datetime64[D]": + # pre-empty flaky xfail, tc2 values are seemingly-random + if not (np.array(tc2) == input2).all(): + pytest.xfail(reason="GH#7996") expected = Series([False, False, False, False, True, True, False]) tm.assert_series_equal(tc2.duplicated(), expected) diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py index 0686b397cbd81..0ddf1dfcabb59 100644 --- 
a/pandas/tests/series/test_io.py +++ b/pandas/tests/series/test_io.py @@ -191,6 +191,20 @@ def test_to_csv_compression(self, s, encoding, compression): s, pd.read_csv(fh, index_col=0, squeeze=True, encoding=encoding) ) + def test_to_csv_interval_index(self): + # GH 28210 + s = Series(["foo", "bar", "baz"], index=pd.interval_range(0, 3)) + + with ensure_clean("__tmp_to_csv_interval_index__.csv") as path: + s.to_csv(path, header=False) + result = self.read_csv(path, index_col=0, squeeze=True) + + # can't roundtrip intervalindex via read_csv so check string repr (GH 23595) + expected = s.copy() + expected.index = expected.index.astype(str) + + assert_series_equal(result, expected) + class TestSeriesIO: def test_to_frame(self, datetime_series): diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index f1b84acf68755..ddd2c566f4cda 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -578,6 +578,28 @@ def test_fillna_categorical(self, fill_value, expected_output): exp = Series(Categorical(expected_output, categories=["a", "b"])) tm.assert_series_equal(s.fillna(fill_value), exp) + @pytest.mark.parametrize( + "fill_value, expected_output", + [ + (Series(["a", "b", "c", "d", "e"]), ["a", "b", "b", "d", "e"]), + (Series(["b", "d", "a", "d", "a"]), ["a", "d", "b", "d", "a"]), + ( + Series( + Categorical( + ["b", "d", "a", "d", "a"], categories=["b", "c", "d", "e", "a"] + ) + ), + ["a", "d", "b", "d", "a"], + ), + ], + ) + def test_fillna_categorical_with_new_categories(self, fill_value, expected_output): + # GH 26215 + data = ["a", np.nan, "b", np.nan, np.nan] + s = Series(Categorical(data, categories=["a", "b", "c", "d", "e"])) + exp = Series(Categorical(expected_output, categories=["a", "b", "c", "d", "e"])) + tm.assert_series_equal(s.fillna(fill_value), exp) + def test_fillna_categorical_raise(self): data = ["a", np.nan, "b", np.nan, np.nan] s = Series(Categorical(data, categories=["a", "b"])) diff --git a/pandas/tests/series/test_period.py b/pandas/tests/series/test_period.py index 9b34b52bf39b9..4aeb211170d8f 100644 --- a/pandas/tests/series/test_period.py +++ b/pandas/tests/series/test_period.py @@ -71,10 +71,9 @@ def test_NaT_scalar(self): series[2] = val assert pd.isna(series[2]) - @pytest.mark.xfail(reason="PeriodDtype Series not supported yet") def test_NaT_cast(self): result = Series([np.nan]).astype("period[D]") - expected = Series([pd.NaT]) + expected = Series([pd.NaT], dtype="period[D]") tm.assert_series_equal(result, expected) def test_set_none(self): diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index d0ca5d82c6b33..fbe3f929cf5b5 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -355,48 +355,6 @@ def test_asfreq_datetimeindex_empty_series(self): ) tm.assert_index_equal(expected.index, result.index) - def test_diff(self): - # Just run the function - self.ts.diff() - - # int dtype - a = 10000000000000000 - b = a + 1 - s = Series([a, b]) - - rs = s.diff() - assert rs[1] == 1 - - # neg n - rs = self.ts.diff(-1) - xp = self.ts - self.ts.shift(-1) - assert_series_equal(rs, xp) - - # 0 - rs = self.ts.diff(0) - xp = self.ts - self.ts - assert_series_equal(rs, xp) - - # datetime diff (GH3100) - s = Series(date_range("20130102", periods=5)) - rs = s - s.shift(1) - xp = s.diff() - assert_series_equal(rs, xp) - - # timedelta diff - nrs = rs - rs.shift(1) - nxp = xp.diff() - assert_series_equal(nrs, nxp) - - # with tz - s = 
Series( - date_range("2000-01-01 09:00:00", periods=5, tz="US/Eastern"), name="foo" - ) - result = s.diff() - assert_series_equal( - result, Series(TimedeltaIndex(["NaT"] + ["1 days"] * 4), name="foo") - ) - def test_pct_change(self): rs = self.ts.pct_change(fill_method=None) assert_series_equal(rs, self.ts / self.ts.shift(1) - 1) diff --git a/pandas/tests/series/test_ufunc.py b/pandas/tests/series/test_ufunc.py index c024e9caba156..8144a3931b9b8 100644 --- a/pandas/tests/series/test_ufunc.py +++ b/pandas/tests/series/test_ufunc.py @@ -252,10 +252,7 @@ def __add__(self, other): "values", [ pd.array([1, 3, 2]), - pytest.param( - pd.array([1, 10, 0], dtype="Sparse[int]"), - marks=pytest.mark.xfail(resason="GH-27080. Bug in SparseArray"), - ), + pd.array([1, 10, 0], dtype="Sparse[int]"), pd.to_datetime(["2000", "2010", "2001"]), pd.to_datetime(["2000", "2010", "2001"]).tz_localize("CET"), pd.to_datetime(["2000", "2010", "2001"]).to_period(freq="D"), diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 479e55c86fcd1..65b2dab1b02a8 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -1,4 +1,5 @@ import collections +from distutils.version import LooseVersion from functools import partial import string @@ -117,3 +118,13 @@ def test_git_version(): git_version = pd.__git_version__ assert len(git_version) == 40 assert all(c in string.hexdigits for c in git_version) + + +def test_version_tag(): + version = pd.__version__ + try: + version > LooseVersion("0.0.1") + except TypeError: + raise ValueError( + "No git tags exist, please sync tags between upstream and your repo" + ) diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py index 4070624985068..ca514f62f451d 100644 --- a/pandas/tests/test_expressions.py +++ b/pandas/tests/test_expressions.py @@ -66,7 +66,7 @@ def run_arithmetic(self, df, other, assert_func, check_dtype=False, test_flex=Tr operator_name = "truediv" if test_flex: - op = lambda x, y: getattr(df, arith)(y) + op = lambda x, y: getattr(x, arith)(y) op.__name__ = arith else: op = getattr(operator, operator_name) @@ -318,7 +318,6 @@ def testit(): for f in [self.frame, self.frame2, self.mixed, self.mixed2]: for cond in [True, False]: - c = np.empty(f.shape, dtype=np.bool_) c.fill(cond) result = expr.where(c, f.values, f.values + 1) @@ -431,3 +430,29 @@ def test_bool_ops_column_name_dtype(self, test_input, expected): # GH 22383 - .ne fails if columns containing column name 'dtype' result = test_input.loc[:, ["a", "dtype"]].ne(test_input.loc[:, ["a", "dtype"]]) assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "arith", ("add", "sub", "mul", "mod", "truediv", "floordiv") + ) + @pytest.mark.parametrize("axis", (0, 1)) + def test_frame_series_axis(self, axis, arith): + # GH#26736 Dataframe.floordiv(Series, axis=1) fails + if axis == 1 and arith == "floordiv": + pytest.xfail("'floordiv' does not succeed with axis=1 #27636") + + df = self.frame + if axis == 1: + other = self.frame.iloc[0, :] + else: + other = self.frame.iloc[:, 0] + + expr._MIN_ELEMENTS = 0 + + op_func = getattr(df, arith) + + expr.set_use_numexpr(False) + expected = op_func(other, axis=axis) + expr.set_use_numexpr(True) + + result = op_func(other, axis=axis) + assert_frame_equal(expected, result) diff --git a/pandas/tests/window/test_ewm.py b/pandas/tests/window/test_ewm.py index a05b567adad7a..1683fda500f85 100644 --- a/pandas/tests/window/test_ewm.py +++ b/pandas/tests/window/test_ewm.py @@ -4,7 +4,7 @@ from pandas.errors import 
UnsupportedFunctionCall from pandas import DataFrame, Series -import pandas.core.window as rwindow +from pandas.core.window import EWM from pandas.tests.window.common import Base @@ -60,7 +60,7 @@ def test_constructor(self, which): @pytest.mark.parametrize("method", ["std", "mean", "var"]) def test_numpy_compat(self, method): # see gh-12811 - e = rwindow.EWM(Series([2, 4, 6]), alpha=0.5) + e = EWM(Series([2, 4, 6]), alpha=0.5) msg = "numpy operations are not valid with window objects" diff --git a/pandas/tests/window/test_expanding.py b/pandas/tests/window/test_expanding.py index 1e92c981964c5..098acdff93ac6 100644 --- a/pandas/tests/window/test_expanding.py +++ b/pandas/tests/window/test_expanding.py @@ -5,7 +5,7 @@ import pandas as pd from pandas import DataFrame, Series -import pandas.core.window as rwindow +from pandas.core.window import Expanding from pandas.tests.window.common import Base import pandas.util.testing as tm @@ -42,7 +42,7 @@ def test_constructor(self, which): @pytest.mark.parametrize("method", ["std", "mean", "sum", "max", "min", "var"]) def test_numpy_compat(self, method): # see gh-12811 - e = rwindow.Expanding(Series([2, 4, 6]), window=2) + e = Expanding(Series([2, 4, 6]), window=2) msg = "numpy operations are not valid with window objects" diff --git a/pandas/tests/window/test_moments.py b/pandas/tests/window/test_moments.py index d860859958254..3d6cd7d10bd10 100644 --- a/pandas/tests/window/test_moments.py +++ b/pandas/tests/window/test_moments.py @@ -10,7 +10,7 @@ import pandas as pd from pandas import DataFrame, Index, Series, concat, isna, notna -import pandas.core.window as rwindow +from pandas.core.window.common import _flex_binary_moment from pandas.tests.window.common import Base import pandas.util.testing as tm @@ -1878,7 +1878,7 @@ def test_flex_binary_moment(self): " np.ndarray/Series/DataFrame" ) with pytest.raises(TypeError, match=msg): - rwindow._flex_binary_moment(5, 6, None) + _flex_binary_moment(5, 6, None) def test_corr_sanity(self): # GH 3155 diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index c7177e1d3914f..b4787bf25e3bb 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -8,7 +8,7 @@ import pandas as pd from pandas import DataFrame, Series -import pandas.core.window as rwindow +from pandas.core.window import Rolling from pandas.tests.window.common import Base import pandas.util.testing as tm @@ -101,7 +101,7 @@ def test_constructor_timedelta_window_and_minperiods(self, window, raw): @pytest.mark.parametrize("method", ["std", "mean", "sum", "max", "min", "var"]) def test_numpy_compat(self, method): # see gh-12811 - r = rwindow.Rolling(Series([2, 4, 6]), window=2) + r = Rolling(Series([2, 4, 6]), window=2) msg = "numpy operations are not valid with window objects" @@ -326,3 +326,11 @@ def test_rolling_axis_count(self, axis_frame): result = df.rolling(2, axis=axis_frame).count() tm.assert_frame_equal(result, expected) + + def test_readonly_array(self): + # GH-27766 + arr = np.array([1, 3, np.nan, 3, 5]) + arr.setflags(write=False) + result = pd.Series(arr).rolling(2).mean() + expected = pd.Series([np.nan, 2, np.nan, np.nan, 4]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/window/test_window.py b/pandas/tests/window/test_window.py index a6a56c98a9377..5692404205012 100644 --- a/pandas/tests/window/test_window.py +++ b/pandas/tests/window/test_window.py @@ -6,7 +6,7 @@ import pandas as pd from pandas import Series -import pandas.core.window as 
rwindow +from pandas.core.window import Window from pandas.tests.window.common import Base @@ -50,7 +50,7 @@ def test_constructor_with_win_type(self, which, win_types): @pytest.mark.parametrize("method", ["sum", "mean"]) def test_numpy_compat(self, method): # see gh-12811 - w = rwindow.Window(Series([2, 4, 6]), window=[0, 2]) + w = Window(Series([2, 4, 6]), window=[0, 2]) msg = "numpy operations are not valid with window objects" diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index a208d5ad2fea9..edf58ba3850a1 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -204,8 +204,7 @@ def __add__(date): normalize : bool, default False Whether to round the result of a DateOffset addition down to the previous midnight. - **kwds - Temporal parameter that add to or replace the offset value. + **kwds : Temporal parameter that add to or replace the offset value. Parameters that **add** to the offset (like Timedelta): @@ -233,16 +232,19 @@ def __add__(date): See Also -------- - dateutil.relativedelta.relativedelta + dateutil.relativedelta.relativedelta : The relativedelta type is designed + to be applied to an existing datetime an can replace specific components of + that datetime, or represents an interval of time. Examples -------- + >>> from pandas.tseries.offsets import DateOffset >>> ts = pd.Timestamp('2017-01-01 09:10:11') >>> ts + DateOffset(months=3) Timestamp('2017-04-01 09:10:11') >>> ts = pd.Timestamp('2017-01-01 09:10:11') - >>> ts + DateOffset(month=3) + >>> ts + DateOffset(months=2) Timestamp('2017-03-01 09:10:11') """ diff --git a/pandas/util/_decorators.py b/pandas/util/_decorators.py index 5c7d481ff2586..8a25e511b5fc4 100644 --- a/pandas/util/_decorators.py +++ b/pandas/util/_decorators.py @@ -1,21 +1,35 @@ from functools import wraps import inspect from textwrap import dedent -from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union +from typing import ( + Any, + Callable, + Dict, + List, + Optional, + Tuple, + Type, + TypeVar, + Union, + cast, +) import warnings from pandas._libs.properties import cache_readonly # noqa +FuncType = Callable[..., Any] +F = TypeVar("F", bound=FuncType) + def deprecate( name: str, - alternative: Callable, + alternative: Callable[..., Any], version: str, alt_name: Optional[str] = None, klass: Optional[Type[Warning]] = None, stacklevel: int = 2, msg: Optional[str] = None, -) -> Callable: +) -> Callable[..., Any]: """ Return a new function that emits a deprecation warning on use. @@ -47,7 +61,7 @@ def deprecate( warning_msg = msg or "{} is deprecated, use {} instead".format(name, alt_name) @wraps(alternative) - def wrapper(*args, **kwargs): + def wrapper(*args, **kwargs) -> Callable[..., Any]: warnings.warn(warning_msg, klass, stacklevel=stacklevel) return alternative(*args, **kwargs) @@ -90,9 +104,9 @@ def wrapper(*args, **kwargs): def deprecate_kwarg( old_arg_name: str, new_arg_name: Optional[str], - mapping: Optional[Union[Dict, Callable[[Any], Any]]] = None, + mapping: Optional[Union[Dict[Any, Any], Callable[[Any], Any]]] = None, stacklevel: int = 2, -) -> Callable: +) -> Callable[..., Any]: """ Decorator to deprecate a keyword argument of a function. @@ -160,27 +174,27 @@ def deprecate_kwarg( "mapping from old to new argument values " "must be dict or callable!" 
) - def _deprecate_kwarg(func): + def _deprecate_kwarg(func: F) -> F: @wraps(func) - def wrapper(*args, **kwargs): + def wrapper(*args, **kwargs) -> Callable[..., Any]: old_arg_value = kwargs.pop(old_arg_name, None) - if new_arg_name is None and old_arg_value is not None: - msg = ( - "the '{old_name}' keyword is deprecated and will be " - "removed in a future version. " - "Please take steps to stop the use of '{old_name}'" - ).format(old_name=old_arg_name) - warnings.warn(msg, FutureWarning, stacklevel=stacklevel) - kwargs[old_arg_name] = old_arg_value - return func(*args, **kwargs) - if old_arg_value is not None: - if mapping is not None: - if hasattr(mapping, "get"): - new_arg_value = mapping.get(old_arg_value, old_arg_value) - else: + if new_arg_name is None: + msg = ( + "the '{old_name}' keyword is deprecated and will be " + "removed in a future version. " + "Please take steps to stop the use of '{old_name}'" + ).format(old_name=old_arg_name) + warnings.warn(msg, FutureWarning, stacklevel=stacklevel) + kwargs[old_arg_name] = old_arg_value + return func(*args, **kwargs) + + elif mapping is not None: + if callable(mapping): new_arg_value = mapping(old_arg_value) + else: + new_arg_value = mapping.get(old_arg_value, old_arg_value) msg = ( "the {old_name}={old_val!r} keyword is deprecated, " "use {new_name}={new_val!r} instead" @@ -198,7 +212,7 @@ def wrapper(*args, **kwargs): ).format(old_name=old_arg_name, new_name=new_arg_name) warnings.warn(msg, FutureWarning, stacklevel=stacklevel) - if kwargs.get(new_arg_name, None) is not None: + if kwargs.get(new_arg_name) is not None: msg = ( "Can only specify '{old_name}' or '{new_name}', " "not both" ).format(old_name=old_arg_name, new_name=new_arg_name) @@ -207,17 +221,17 @@ def wrapper(*args, **kwargs): kwargs[new_arg_name] = new_arg_value return func(*args, **kwargs) - return wrapper + return cast(F, wrapper) return _deprecate_kwarg def rewrite_axis_style_signature( name: str, extra_params: List[Tuple[str, Any]] -) -> Callable: - def decorate(func): +) -> Callable[..., Any]: + def decorate(func: F) -> F: @wraps(func) - def wrapper(*args, **kwargs): + def wrapper(*args, **kwargs) -> Callable[..., Any]: return func(*args, **kwargs) kind = inspect.Parameter.POSITIONAL_OR_KEYWORD @@ -234,8 +248,9 @@ def wrapper(*args, **kwargs): sig = inspect.Signature(params) - func.__signature__ = sig - return wrapper + # https://github.com/python/typing/issues/598 + func.__signature__ = sig # type: ignore + return cast(F, wrapper) return decorate @@ -279,18 +294,17 @@ def __init__(self, *args, **kwargs): self.params = args or kwargs - def __call__(self, func: Callable) -> Callable: + def __call__(self, func: F) -> F: func.__doc__ = func.__doc__ and func.__doc__ % self.params return func def update(self, *args, **kwargs) -> None: """ Update self.params with supplied args. - - If called, we assume self.params is a dict. 
""" - self.params.update(*args, **kwargs) + if isinstance(self.params, dict): + self.params.update(*args, **kwargs) class Appender: @@ -320,7 +334,7 @@ def __init__(self, addendum: Optional[str], join: str = "", indents: int = 0): self.addendum = addendum self.join = join - def __call__(self, func: Callable) -> Callable: + def __call__(self, func: F) -> F: func.__doc__ = func.__doc__ if func.__doc__ else "" self.addendum = self.addendum if self.addendum else "" docitems = [func.__doc__, self.addendum] diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index 3de4e5d66d577..627757aaa3741 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -25,9 +25,8 @@ def test_foo(): """ from distutils.version import LooseVersion import locale -from typing import Optional +from typing import Callable, Optional -from _pytest.mark.structures import MarkDecorator import pytest from pandas.compat import is_platform_32bit, is_platform_windows @@ -103,7 +102,7 @@ def _skip_if_no_scipy(): ) -def skip_if_installed(package: str,) -> MarkDecorator: +def skip_if_installed(package: str,) -> Callable: """ Skip a test if a package is installed. @@ -117,7 +116,7 @@ def skip_if_installed(package: str,) -> MarkDecorator: ) -def skip_if_no(package: str, min_version: Optional[str] = None) -> MarkDecorator: +def skip_if_no(package: str, min_version: Optional[str] = None) -> Callable: """ Generic function to help skip tests when required packages are not present on the testing system. diff --git a/pandas/util/testing.py b/pandas/util/testing.py index cf8452cdd0c59..0d543f891a5f6 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -5,7 +5,6 @@ from functools import wraps import gzip import http.client -import lzma import os import re from shutil import rmtree @@ -26,7 +25,7 @@ ) import pandas._libs.testing as _testing -from pandas.compat import raise_with_traceback +from pandas.compat import _get_lzma_file, _import_lzma, raise_with_traceback from pandas.core.dtypes.common import ( is_bool, @@ -70,6 +69,8 @@ from pandas.io.common import urlopen from pandas.io.formats.printing import pprint_thing +lzma = _import_lzma() + N = 30 K = 4 _RAISE_NETWORK_ERROR_DEFAULT = False @@ -211,7 +212,7 @@ def decompress_file(path, compression): elif compression == "bz2": f = bz2.BZ2File(path, "rb") elif compression == "xz": - f = lzma.LZMAFile(path, "rb") + f = _get_lzma_file(lzma)(path, "rb") elif compression == "zip": zip_file = zipfile.ZipFile(path) zip_names = zip_file.namelist() @@ -264,9 +265,7 @@ def write_to_compressed(compression, path, data, dest="test"): compress_method = bz2.BZ2File elif compression == "xz": - import lzma - - compress_method = lzma.LZMAFile + compress_method = _get_lzma_file(lzma) else: msg = "Unrecognized compression type: {}".format(compression) raise ValueError(msg) @@ -581,7 +580,8 @@ def assert_index_equal( check_categorical: bool = True, obj: str = "Index", ) -> None: - """Check that left and right Index are equal. + """ + Check that left and right Index are equal. Parameters ---------- @@ -1082,7 +1082,8 @@ def assert_series_equal( check_categorical=True, obj="Series", ): - """Check that left and right Series are equal. + """ + Check that left and right Series are equal. 
Parameters ---------- diff --git a/requirements-dev.txt b/requirements-dev.txt index e49ad10bfc99d..cf11a3ee28258 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -45,7 +45,7 @@ html5lib lxml openpyxl pyarrow>=0.9.0 -pyqt +pyqt5>=5.9.2 tables>=3.4.2 python-snappy s3fs diff --git a/scripts/find_commits_touching_func.py b/scripts/find_commits_touching_func.py index 1075a257d4270..95a892b822cff 100755 --- a/scripts/find_commits_touching_func.py +++ b/scripts/find_commits_touching_func.py @@ -10,11 +10,11 @@ Usage:: $ ./find_commits_touching_func.py (see arguments below) """ -import logging -import re -import os import argparse from collections import namedtuple +import logging +import os +import re from dateutil.parser import parse diff --git a/scripts/generate_pip_deps_from_conda.py b/scripts/generate_pip_deps_from_conda.py index ac73859b22598..29fe8bf84c12b 100755 --- a/scripts/generate_pip_deps_from_conda.py +++ b/scripts/generate_pip_deps_from_conda.py @@ -16,11 +16,11 @@ import os import re import sys -import yaml +import yaml EXCLUDE = {"python=3"} -RENAME = {"pytables": "tables"} +RENAME = {"pytables": "tables", "pyqt": "pyqt5"} def conda_package_to_pip(package): diff --git a/scripts/merge-pr.py b/scripts/merge-pr.py index 95352751a23c6..300cb149f387f 100755 --- a/scripts/merge-pr.py +++ b/scripts/merge-pr.py @@ -22,14 +22,15 @@ # usage: ./apache-pr-merge.py (see config env vars below) # # Lightly modified from version of this script in incubator-parquet-format -from subprocess import check_output -from requests.auth import HTTPBasicAuth -import requests import os +from subprocess import check_output import sys import textwrap +import requests +from requests.auth import HTTPBasicAuth + PANDAS_HOME = "." PROJECT_NAME = "pandas" print("PANDAS_HOME = " + PANDAS_HOME) diff --git a/scripts/tests/test_validate_docstrings.py b/scripts/tests/test_validate_docstrings.py index f3364e6725a20..85e5bf239cbfa 100644 --- a/scripts/tests/test_validate_docstrings.py +++ b/scripts/tests/test_validate_docstrings.py @@ -2,12 +2,13 @@ import random import string import textwrap -import pytest -import numpy as np -import pandas as pd +import numpy as np +import pytest import validate_docstrings +import pandas as pd + validate_one = validate_docstrings.validate_one @@ -200,7 +201,7 @@ def contains(self, pat, case=True, na=np.nan): def mode(self, axis, numeric_only): """ - Ensure sphinx directives don't affect checks for trailing periods. + Ensure reST directives don't affect checks for leading periods. Parameters ---------- @@ -447,6 +448,27 @@ def deprecation_in_wrong_order(self): def method_wo_docstrings(self): pass + def directives_without_two_colons(self, first, second): + """ + Ensure reST directives have trailing colons. + + Parameters + ---------- + first : str + Sentence ending in period, followed by single directive w/o colons. + + .. versionchanged 0.1.2 + + second : bool + Sentence ending in period, followed by multiple directives w/o + colons. + + .. versionadded 0.1.2 + .. 
deprecated 0.00.0 + + """ + pass + class BadSummaries: def wrong_line(self): @@ -840,6 +862,7 @@ def test_bad_class(self, capsys): "plot", "method", "private_classes", + "directives_without_two_colons", ], ) def test_bad_generic_functions(self, capsys, func): @@ -879,6 +902,14 @@ def test_bad_generic_functions(self, capsys, func): "deprecation_in_wrong_order", ("Deprecation warning should precede extended summary",), ), + ( + "BadGenericDocStrings", + "directives_without_two_colons", + ( + "reST directives ['versionchanged', 'versionadded', " + "'deprecated'] must be followed by two colons", + ), + ), ( "BadSeeAlso", "desc_no_period", diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py index 37623d32db685..401eaf8ff5ed5 100755 --- a/scripts/validate_docstrings.py +++ b/scripts/validate_docstrings.py @@ -13,20 +13,20 @@ $ ./validate_docstrings.py $ ./validate_docstrings.py pandas.DataFrame.head """ -import os -import sys -import json -import re -import glob -import functools -import collections import argparse -import pydoc -import inspect -import importlib +import ast +import collections import doctest +import functools +import glob +import importlib +import inspect +import json +import os +import pydoc +import re +import sys import tempfile -import ast import textwrap import flake8.main.application @@ -41,24 +41,25 @@ # script. Setting here before matplotlib is loaded. # We don't warn for the number of open plots, as none is actually being opened os.environ["MPLBACKEND"] = "Template" -import matplotlib +import matplotlib # noqa: E402 isort:skip matplotlib.rc("figure", max_open_warning=10000) -import numpy +import numpy # noqa: E402 isort:skip BASE_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) sys.path.insert(0, os.path.join(BASE_PATH)) -import pandas +import pandas # noqa: E402 isort:skip sys.path.insert(1, os.path.join(BASE_PATH, "doc", "sphinxext")) -from numpydoc.docscrape import NumpyDocString -from pandas.io.formats.printing import pprint_thing +from numpydoc.docscrape import NumpyDocString # noqa: E402 isort:skip +from pandas.io.formats.printing import pprint_thing # noqa: E402 isort:skip PRIVATE_CLASSES = ["NDFrame", "IndexOpsMixin"] DIRECTIVES = ["versionadded", "versionchanged", "deprecated"] +DIRECTIVE_PATTERN = re.compile(rf"^\s*\.\. ({'|'.join(DIRECTIVES)})(?!::)", re.I | re.M) ALLOWED_SECTIONS = [ "Parameters", "Attributes", @@ -93,6 +94,7 @@ "GL07": "Sections are in the wrong order. Correct order is: " "{correct_sections}", "GL08": "The object does not have a docstring", "GL09": "Deprecation warning should precede extended summary", + "GL10": "reST directives {directives} must be followed by two colons", "SS01": "No summary found (a short summary in a single line should be " "present at the beginning of the docstring)", "SS02": "Summary does not start with a capital letter", @@ -478,6 +480,10 @@ def parameter_mismatches(self): def correct_parameters(self): return not bool(self.parameter_mismatches) + @property + def directives_without_two_colons(self): + return DIRECTIVE_PATTERN.findall(self.raw_doc) + def parameter_type(self, param): return self.doc_parameters[param][0] @@ -697,6 +703,10 @@ def get_validation_data(doc): if doc.deprecated and not doc.extended_summary.startswith(".. 
deprecated:: "): errs.append(error("GL09")) + directives_without_two_colons = doc.directives_without_two_colons + if directives_without_two_colons: + errs.append(error("GL10", directives=directives_without_two_colons)) + if not doc.summary: errs.append(error("SS01")) else: diff --git a/setup.cfg b/setup.cfg index 716ff5d9d8853..43dbac15f5cfe 100644 --- a/setup.cfg +++ b/setup.cfg @@ -110,68 +110,25 @@ directory = coverage_html_report # To be kept consistent with "Import Formatting" section in contributing.rst [isort] -known_pre_libs=pandas._config -known_pre_core=pandas._libs,pandas.util._*,pandas.compat,pandas.errors -known_dtypes=pandas.core.dtypes -known_post_core=pandas.tseries,pandas.io,pandas.plotting -sections=FUTURE,STDLIB,THIRDPARTY,PRE_LIBS,PRE_CORE,DTYPES,FIRSTPARTY,POST_CORE,LOCALFOLDER - -known_first_party=pandas -known_third_party=Cython,numpy,dateutil,matplotlib,python-dateutil,pytz,pyarrow,pytest - -multi_line_output=3 -include_trailing_comma=True -force_grid_wrap=0 -combine_as_imports=True -line_length=88 -force_sort_within_sections=True -skip_glob=env, -skip= - pandas/__init__.py - pandas/core/api.py, - pandas/io/msgpack/__init__.py - asv_bench/benchmarks/attrs_caching.py, - asv_bench/benchmarks/binary_ops.py, - asv_bench/benchmarks/categoricals.py, - asv_bench/benchmarks/ctors.py, - asv_bench/benchmarks/eval.py, - asv_bench/benchmarks/frame_ctor.py, - asv_bench/benchmarks/frame_methods.py, - asv_bench/benchmarks/gil.py, - asv_bench/benchmarks/groupby.py, - asv_bench/benchmarks/index_object.py, - asv_bench/benchmarks/indexing.py, - asv_bench/benchmarks/inference.py, - asv_bench/benchmarks/io/csv.py, - asv_bench/benchmarks/io/excel.py, - asv_bench/benchmarks/io/hdf.py, - asv_bench/benchmarks/io/json.py, - asv_bench/benchmarks/io/msgpack.py, - asv_bench/benchmarks/io/pickle.py, - asv_bench/benchmarks/io/sql.py, - asv_bench/benchmarks/io/stata.py, - asv_bench/benchmarks/join_merge.py, - asv_bench/benchmarks/multiindex_object.py, - asv_bench/benchmarks/panel_ctor.py, - asv_bench/benchmarks/panel_methods.py, - asv_bench/benchmarks/plotting.py, - asv_bench/benchmarks/reindex.py, - asv_bench/benchmarks/replace.py, - asv_bench/benchmarks/reshape.py, - asv_bench/benchmarks/rolling.py, - asv_bench/benchmarks/series_methods.py, - asv_bench/benchmarks/sparse.py, - asv_bench/benchmarks/stat_ops.py, - asv_bench/benchmarks/timeseries.py - asv_bench/benchmarks/pandas_vb_common.py - asv_bench/benchmarks/offset.py - asv_bench/benchmarks/dtypes.py - asv_bench/benchmarks/strings.py - asv_bench/benchmarks/period.py +known_pre_libs = pandas._config +known_pre_core = pandas._libs,pandas.util._*,pandas.compat,pandas.errors +known_dtypes = pandas.core.dtypes +known_post_core = pandas.tseries,pandas.io,pandas.plotting +sections = FUTURE,STDLIB,THIRDPARTY,PRE_LIBS,PRE_CORE,DTYPES,FIRSTPARTY,POST_CORE,LOCALFOLDER +known_first_party = pandas +known_third_party = _pytest,announce,dateutil,docutils,flake8,git,hypothesis,jinja2,lxml,matplotlib,numpy,numpydoc,pkg_resources,pyarrow,pytest,pytz,requests,scipy,setuptools,sphinx,sqlalchemy,validate_docstrings,yaml +multi_line_output = 3 +include_trailing_comma = True +force_grid_wrap = 0 +combine_as_imports = True +line_length = 88 +force_sort_within_sections = True +skip_glob = env, +skip = pandas/__init__.py,pandas/core/api.py [mypy] ignore_missing_imports=True no_implicit_optional=True [mypy-pandas.conftest,pandas.tests.*] -ignore_errors=True \ No newline at end of file +ignore_errors=True diff --git a/setup.py b/setup.py index 
d2c6b18b892cd..a86527ace092b 100755 --- a/setup.py +++ b/setup.py @@ -6,16 +6,16 @@ BSD license. Parts are from lxml (https://github.com/lxml/lxml) """ +from distutils.sysconfig import get_config_vars +from distutils.version import LooseVersion import os from os.path import join as pjoin - -import pkg_resources import platform -from distutils.sysconfig import get_config_vars -import sys import shutil -from distutils.version import LooseVersion -from setuptools import setup, Command, find_packages +import sys + +import pkg_resources +from setuptools import Command, find_packages, setup # versioning import versioneer @@ -58,8 +58,8 @@ def is_platform_mac(): # The import of Extension must be after the import of Cython, otherwise # we do not get the appropriately patched class. # See https://cython.readthedocs.io/en/latest/src/reference/compilation.html -from distutils.extension import Extension # noqa:E402 -from distutils.command.build import build # noqa:E402 +from distutils.extension import Extension # noqa: E402 isort:skip +from distutils.command.build import build # noqa: E402 isort:skip try: if not _CYTHON_INSTALLED: @@ -831,9 +831,7 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): ] }, entry_points={ - "pandas_plotting_backends": [ - "matplotlib = pandas:plotting._matplotlib", - ], + "pandas_plotting_backends": ["matplotlib = pandas:plotting._matplotlib"] }, **setuptools_kwargs )
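
The entry_points change at the end of setup.py keeps pandas itself registered under the pandas_plotting_backends group, the same group that the test_register_entrypoint test above looks up via pkg_resources. As a purely illustrative sketch (the distribution and module names are assumptions, not taken from this patch), a third-party backend would advertise itself to pandas by declaring an entry point in that group from its own setup.py::

    from setuptools import setup

    setup(
        name="pandas-my-backend",      # hypothetical distribution name
        version="0.1.0",
        py_modules=["my_backend"],     # module expected to expose a ``plot`` callable
        entry_points={
            "pandas_plotting_backends": ["my_backend = my_backend"],
        },
    )

Once such a distribution is installed, selecting the backend by name (for example via the plotting.backend option) should resolve through the entry point without any manual sys.modules registration.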