diff --git a/.gitignore b/.gitignore index d987bab6fd5d7..19f1cc804dca0 100644 --- a/.gitignore +++ b/.gitignore @@ -18,6 +18,7 @@ .vagrant .noseids .ipynb_checkpoints +.tags # Compiled source # ################### diff --git a/.travis.yml b/.travis.yml index 5a16c1a6c25e7..c6f6d8b81ae59 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,20 +1,30 @@ - +sudo: false language: python -env: +# To turn off cached miniconda, cython files and compiler cache comment out the +# USE_CACHE=true line for the build in the matrix below. To delete caches go to +# https://travis-ci.org/OWNER/REPOSITORY/caches or run +# travis cache --delete inside the project directory from the travis command line client +# The cash directories will be deleted if anything in ci/ changes in a commit +cache: + directories: + - $HOME/miniconda # miniconda cache + - $HOME/.cache # cython cache + - $HOME/.ccache # compiler cache +env: global: # scatterci API key #- secure: "Bx5umgo6WjuGY+5XFa004xjCiX/vq0CyMZ/ETzcs7EIBI1BE/0fIDXOoWhoxbY9HPfdPGlDnDgB9nGqr5wArO2s+BavyKBWg6osZ3dmkfuJPMOWeyCa92EeP+sfKw8e5HSU5MizW9e319wHWOF/xkzdHR7T67Qd5erhv91x4DnQ=" # ironcache API key - - secure: "e4eEFn9nDQc3Xa5BWYkzfX37jaWVq89XidVX+rcCNEr5OlOImvveeXnF1IzbRXznH4Sv0YsLwUd8RGUWOmyCvkONq/VJeqCHWtTMyfaCIdqSyhIP9Odz8r9ahch+Y0XFepBey92AJHmlnTh+2GjCDgIiqq4fzglojnp56Vg1ojA=" - - secure: "CjmYmY5qEu3KrvMtel6zWFEtMq8ORBeS1S1odJHnjQpbwT1KY2YFZRVlLphfyDQXSz6svKUdeRrCNp65baBzs3DQNA8lIuXGIBYFeJxqVGtYAZZs6+TzBPfJJK798sGOj5RshrOJkFG2rdlWNuTq/XphI0JOrN3nPUkRrdQRpAw=" + #- secure: "e4eEFn9nDQc3Xa5BWYkzfX37jaWVq89XidVX+rcCNEr5OlOImvveeXnF1IzbRXznH4Sv0YsLwUd8RGUWOmyCvkONq/VJeqCHWtTMyfaCIdqSyhIP9Odz8r9ahch+Y0XFepBey92AJHmlnTh+2GjCDgIiqq4fzglojnp56Vg1ojA=" + #- secure: "CjmYmY5qEu3KrvMtel6zWFEtMq8ORBeS1S1odJHnjQpbwT1KY2YFZRVlLphfyDQXSz6svKUdeRrCNp65baBzs3DQNA8lIuXGIBYFeJxqVGtYAZZs6+TzBPfJJK798sGOj5RshrOJkFG2rdlWNuTq/XphI0JOrN3nPUkRrdQRpAw=" # pandas-docs-bot GH - secure: "PCzUFR8CHmw9lH84p4ygnojdF7Z8U5h7YfY0RyT+5K/aiQ1ZTU3ZkDTPI0/rR5FVMxsEEKEQKMcc5fvqW0PeD7Q2wRmluloKgT9w4EVEJ1ppKf7lITPcvZR2QgVOvjv4AfDtibLHFNiaSjzoqyJVjM4igjOu8WTlF3JfZcmOQjQ=" git: # for cloning - depth: 500 + depth: 1000 matrix: fast_finish: true @@ -29,72 +39,129 @@ matrix: - BUILD_TYPE=conda - JOB_TAG=_OSX - TRAVIS_PYTHON_VERSION=3.5 + - CACHE_NAME="35_osx" + - USE_CACHE=true - python: 2.7 env: - - JOB_NAME: "27_slow_nnet_LOCALE" - - NOSE_ARGS="slow and not network and not disabled" - - LOCALE_OVERRIDE="zh_CN.GB18030" - - FULL_DEPS=true - - JOB_TAG=_LOCALE + - JOB_NAME: "27_slow_nnet_LOCALE" + - NOSE_ARGS="slow and not network and not disabled" + - LOCALE_OVERRIDE="zh_CN.UTF-8" + - FULL_DEPS=true + - JOB_TAG=_LOCALE + - CACHE_NAME="27_slow_nnet_LOCALE" + - USE_CACHE=true + addons: + apt: + packages: + - language-pack-zh-hans - python: 2.7 env: - - JOB_NAME: "27_nslow" - - NOSE_ARGS="not slow and not disabled" - - FULL_DEPS=true - - CLIPBOARD_GUI=gtk2 - - LINT=true + - JOB_NAME: "27_nslow" + - NOSE_ARGS="not slow and not disabled" + - FULL_DEPS=true + - CLIPBOARD_GUI=gtk2 + - LINT=true + - CACHE_NAME="27_nslow" + - USE_CACHE=true + addons: + apt: + packages: + - python-gtk2 - python: 3.4 env: - - JOB_NAME: "34_nslow" - - NOSE_ARGS="not slow and not disabled" - - FULL_DEPS=true - - CLIPBOARD=xsel + - JOB_NAME: "34_nslow" + - NOSE_ARGS="not slow and not disabled" + - FULL_DEPS=true + - CLIPBOARD=xsel + - CACHE_NAME="34_nslow" + - USE_CACHE=true + addons: + apt: + packages: + - xsel - python: 3.5 env: - - JOB_NAME: "35_nslow" - - NOSE_ARGS="not slow and not network and not disabled" - - FULL_DEPS=true - - 
CLIPBOARD=xsel - - COVERAGE=true + - JOB_NAME: "35_nslow" + - NOSE_ARGS="not slow and not network and not disabled" + - FULL_DEPS=true + - CLIPBOARD=xsel + - COVERAGE=true + - CACHE_NAME="35_nslow" +# - USE_CACHE=true # Don't use cache for 35_nslow + addons: + apt: + packages: + - xsel +# In allow_failures - python: 2.7 env: - - JOB_NAME: "27_slow" - - JOB_TAG=_SLOW - - NOSE_ARGS="slow and not network and not disabled" - - FULL_DEPS=true + - JOB_NAME: "27_slow" + - JOB_TAG=_SLOW + - NOSE_ARGS="slow and not network and not disabled" + - FULL_DEPS=true + - CACHE_NAME="27_slow" + - USE_CACHE=true +# In allow_failures - python: 3.4 env: - - JOB_NAME: "34_slow" - - JOB_TAG=_SLOW - - NOSE_ARGS="slow and not network and not disabled" - - FULL_DEPS=true - - CLIPBOARD=xsel + - JOB_NAME: "34_slow" + - JOB_TAG=_SLOW + - NOSE_ARGS="slow and not network and not disabled" + - FULL_DEPS=true + - CLIPBOARD=xsel + - CACHE_NAME="34_slow" + - USE_CACHE=true + addons: + apt: + packages: + - xsel +# In allow_failures - python: 2.7 env: - - JOB_NAME: "27_build_test_conda" - - JOB_TAG=_BUILD_TEST - - NOSE_ARGS="not slow and not disabled" - - FULL_DEPS=true - - BUILD_TEST=true + - JOB_NAME: "27_build_test_conda" + - JOB_TAG=_BUILD_TEST + - NOSE_ARGS="not slow and not disabled" + - FULL_DEPS=true + - BUILD_TEST=true + - CACHE_NAME="27_build_test_conda" + - USE_CACHE=true +# In allow_failures - python: 3.5 env: - - JOB_NAME: "35_numpy_dev" - - JOB_TAG=_NUMPY_DEV - - NOSE_ARGS="not slow and not network and not disabled" - - PANDAS_TESTING_MODE="deprecate" + - JOB_NAME: "35_numpy_dev" + - JOB_TAG=_NUMPY_DEV + - NOSE_ARGS="not slow and not network and not disabled" + - PANDAS_TESTING_MODE="deprecate" + - CACHE_NAME="35_numpy_dev" + - USE_CACHE=true + addons: + apt: + packages: + - libatlas-base-dev + - gfortran +# In allow_failures - python: 2.7 env: - - JOB_NAME: "27_nslow_nnet_COMPAT" - - NOSE_ARGS="not slow and not network and not disabled" - - LOCALE_OVERRIDE="it_IT.UTF-8" - - INSTALL_TEST=true - - JOB_TAG=_COMPAT + - JOB_NAME: "27_nslow_nnet_COMPAT" + - NOSE_ARGS="not slow and not network and not disabled" + - LOCALE_OVERRIDE="it_IT.UTF-8" + - INSTALL_TEST=true + - JOB_TAG=_COMPAT + - CACHE_NAME="27_nslow_nnet_COMPAT" + - USE_CACHE=true + addons: + apt: + packages: + - language-pack-it +# In allow_failures - python: 2.7 env: - - JOB_NAME: "doc_build" - - FULL_DEPS=true - - DOC_BUILD=true # if rst files were changed, build docs in parallel with tests - - JOB_TAG=_DOC_BUILD + - JOB_NAME: "doc_build" + - FULL_DEPS=true + - DOC_BUILD=true + - JOB_TAG=_DOC_BUILD + - CACHE_NAME="doc_build" + - USE_CACHE=true allow_failures: - python: 2.7 env: @@ -102,6 +169,8 @@ matrix: - JOB_TAG=_SLOW - NOSE_ARGS="slow and not network and not disabled" - FULL_DEPS=true + - CACHE_NAME="27_slow" + - USE_CACHE=true - python: 3.4 env: - JOB_NAME: "34_slow" @@ -109,6 +178,12 @@ matrix: - NOSE_ARGS="slow and not network and not disabled" - FULL_DEPS=true - CLIPBOARD=xsel + - CACHE_NAME="34_slow" + - USE_CACHE=true + addons: + apt: + packages: + - xsel - python: 2.7 env: - JOB_NAME: "27_build_test_conda" @@ -116,12 +191,21 @@ matrix: - NOSE_ARGS="not slow and not disabled" - FULL_DEPS=true - BUILD_TEST=true + - CACHE_NAME="27_build_test_conda" + - USE_CACHE=true - python: 3.5 env: - JOB_NAME: "35_numpy_dev" - JOB_TAG=_NUMPY_DEV - NOSE_ARGS="not slow and not network and not disabled" - PANDAS_TESTING_MODE="deprecate" + - CACHE_NAME="35_numpy_dev" + - USE_CACHE=true + addons: + apt: + packages: + - libatlas-base-dev + - gfortran - python: 2.7 
env: - JOB_NAME: "27_nslow_nnet_COMPAT" @@ -129,15 +213,24 @@ matrix: - LOCALE_OVERRIDE="it_IT.UTF-8" - INSTALL_TEST=true - JOB_TAG=_COMPAT + - CACHE_NAME="27_nslow_nnet_COMPAT" + - USE_CACHE=true + addons: + apt: + packages: + - language-pack-it - python: 2.7 env: - JOB_NAME: "doc_build" - FULL_DEPS=true - DOC_BUILD=true - JOB_TAG=_DOC_BUILD + - CACHE_NAME="doc_build" + - USE_CACHE=true before_install: - echo "before_install" + - source ci/travis_process_gbq_encryption.sh - echo $VIRTUAL_ENV - export PATH="$HOME/miniconda/bin:$PATH" - df -h @@ -153,9 +246,10 @@ before_install: install: - echo "install start" - - ci/prep_ccache.sh + - ci/check_cache.sh + - ci/prep_cython_cache.sh - ci/install_travis.sh - - ci/submit_ccache.sh + - ci/submit_cython_cache.sh - echo "install done" before_script: @@ -175,6 +269,6 @@ after_success: after_script: - echo "after_script start" - ci/install_test.sh - - source activate pandas && ci/print_versions.py + - source activate pandas && python -c "import pandas; pandas.show_versions();" - ci/print_skipped.py /tmp/nosetests.xml - echo "after_script done" diff --git a/LICENSES/ULTRAJSON_LICENSE b/LICENSES/ULTRAJSON_LICENSE index defca46e7f820..3b2886eb9cfae 100644 --- a/LICENSES/ULTRAJSON_LICENSE +++ b/LICENSES/ULTRAJSON_LICENSE @@ -25,10 +25,10 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) -http://code.google.com/p/stringencoders/ +https://github.com/client9/stringencoders Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. Numeric decoder derived from from TCL library http://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms * Copyright (c) 1988-1993 The Regents of the University of California. - * Copyright (c) 1994 Sun Microsystems, Inc. \ No newline at end of file + * Copyright (c) 1994 Sun Microsystems, Inc. diff --git a/README.md b/README.md index 40b8e1e0a1272..6ebc287fa2cf6 100644 --- a/README.md +++ b/README.md @@ -129,102 +129,27 @@ Here are just a few of the things that pandas does well: The source code is currently hosted on GitHub at: http://github.com/pydata/pandas -Binary installers for the latest released version are available at the Python -package index - - http://pypi.python.org/pypi/pandas/ - -And via `easy_install`: +Binary installers for the latest released version are available at the [Python +package index](http://pypi.python.org/pypi/pandas/) and on conda. ```sh -easy_install pandas +# conda +conda install pandas ``` -or `pip`: - ```sh +# or PyPI pip install pandas ``` -or `conda`: - -```sh -conda install pandas -``` - ## Dependencies - [NumPy](http://www.numpy.org): 1.7.0 or higher - [python-dateutil](http://labix.org/python-dateutil): 1.5 or higher - [pytz](http://pytz.sourceforge.net) - Needed for time zone support with ``pandas.date_range`` -### Highly Recommended Dependencies -- [numexpr](https://github.com/pydata/numexpr) - - Needed to accelerate some expression evaluation operations - - Required by PyTables -- [bottleneck](http://berkeleyanalytics.com/bottleneck) - - Needed to accelerate certain numerical operations - -### Optional dependencies -- [Cython](http://www.cython.org): Only necessary to build development version. Version 0.17.1 or higher. -- [SciPy](http://www.scipy.org): miscellaneous statistical functions -- [PyTables](http://www.pytables.org): necessary for HDF5-based storage -- [SQLAlchemy](http://www.sqlalchemy.org): for SQL database support. 
Version 0.8.1 or higher recommended. -- [matplotlib](http://matplotlib.sourceforge.net/): for plotting -- [statsmodels](http://statsmodels.sourceforge.net/) - - Needed for parts of `pandas.stats` -- For Excel I/O: - - [xlrd/xlwt](http://www.python-excel.org/) - - Excel reading (xlrd) and writing (xlwt) - - [openpyxl](http://packages.python.org/openpyxl/) - - openpyxl version 1.6.1 or higher, but lower than 2.0.0, for - writing .xlsx files - - xlrd >= 0.9.0 - - [XlsxWriter](https://pypi.python.org/pypi/XlsxWriter) - - Alternative Excel writer. -- [Google bq Command Line Tool](https://cloud.google.com/bigquery/bq-command-line-tool) - - Needed for `pandas.io.gbq` -- [boto](https://pypi.python.org/pypi/boto): necessary for Amazon S3 access. -- One of the following combinations of libraries is needed to use the - top-level [`pandas.read_html`][read-html-docs] function: - - [BeautifulSoup4][BeautifulSoup4] and [html5lib][html5lib] (Any - recent version of [html5lib][html5lib] is okay.) - - [BeautifulSoup4][BeautifulSoup4] and [lxml][lxml] - - [BeautifulSoup4][BeautifulSoup4] and [html5lib][html5lib] and [lxml][lxml] - - Only [lxml][lxml], although see [HTML reading gotchas][html-gotchas] - for reasons as to why you should probably **not** take this approach. - -#### Notes about HTML parsing libraries -- If you install [BeautifulSoup4][BeautifulSoup4] you must install - either [lxml][lxml] or [html5lib][html5lib] or both. - `pandas.read_html` will **not** work with *only* `BeautifulSoup4` - installed. -- You are strongly encouraged to read [HTML reading - gotchas][html-gotchas]. It explains issues surrounding the - installation and usage of the above three libraries. -- You may need to install an older version of - [BeautifulSoup4][BeautifulSoup4]: - - Versions 4.2.1, 4.1.3 and 4.0.2 have been confirmed for 64 and - 32-bit Ubuntu/Debian -- Additionally, if you're using [Anaconda][Anaconda] you should - definitely read [the gotchas about HTML parsing][html-gotchas] - libraries -- If you're on a system with `apt-get` you can do - - ```sh - sudo apt-get build-dep python-lxml - ``` - - to get the necessary dependencies for installation of [lxml][lxml]. - This will prevent further headaches down the line. - - [html5lib]: https://github.com/html5lib/html5lib-python "html5lib" - [BeautifulSoup4]: http://www.crummy.com/software/BeautifulSoup "BeautifulSoup4" - [lxml]: http://lxml.de - [Anaconda]: https://store.continuum.io/cshop/anaconda - [NumPy]: http://numpy.scipy.org/ - [html-gotchas]: http://pandas.pydata.org/pandas-docs/stable/gotchas.html#html-table-parsing - [read-html-docs]: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.io.html.read_html.html#pandas.io.html.read_html +See the [full installation instructions](http://pandas.pydata.org/pandas-docs/stable/install.html#dependencies) +for recommended and optional dependencies. 
 ## Installation from sources

 To install pandas from source you need Cython in addition to the normal
diff --git a/appveyor.yml b/appveyor.yml
index 7941820204916..84c34b34626b9 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -16,11 +16,13 @@ environment:
     CMD_IN_ENV: "cmd /E:ON /V:ON /C .\\ci\\run_with_env.cmd"

   matrix:
-    - PYTHON: "C:\\Python34_64"
-      PYTHON_VERSION: "3.4"
-      PYTHON_ARCH: "64"
-      CONDA_PY: "34"
-      CONDA_NPY: "19"
+
+    # disable python 3.4 ATM
+    #- PYTHON: "C:\\Python34_64"
+    #  PYTHON_VERSION: "3.4"
+    #  PYTHON_ARCH: "64"
+    #  CONDA_PY: "34"
+    #  CONDA_NPY: "19"

     - PYTHON: "C:\\Python27_64"
       PYTHON_VERSION: "2.7"
@@ -32,7 +34,8 @@ environment:
       PYTHON_VERSION: "3.5"
       PYTHON_ARCH: "64"
       CONDA_PY: "35"
-      CONDA_NPY: "110"
+      CONDA_NPY: "111"
+

 # We always use a 64-bit machine, but can build x86 distributions
 # with the PYTHON_ARCH variable (which is used by CMD_IN_ENV).
@@ -46,6 +49,12 @@ init:
   - "ECHO %PYTHON_VERSION% %PYTHON%"

 install:
+  # cancel older builds for the same PR
+  - ps: if ($env:APPVEYOR_PULL_REQUEST_NUMBER -and $env:APPVEYOR_BUILD_NUMBER -ne ((Invoke-RestMethod `
+        https://ci.appveyor.com/api/projects/$env:APPVEYOR_ACCOUNT_NAME/$env:APPVEYOR_PROJECT_SLUG/history?recordsNumber=50).builds | `
+        Where-Object pullRequestId -eq $env:APPVEYOR_PULL_REQUEST_NUMBER)[0].buildNumber) { `
+        throw "There are newer queued builds for this pull request, failing early." }
+
   # this installs the appropriate Miniconda (Py2/Py3, 32/64 bit)
   # updates conda & installs: conda-build jinja2 anaconda-client
   - powershell .\ci\install.ps1
@@ -59,11 +68,23 @@ install:
   - cmd: rmdir C:\cygwin /s /q

   # install our build environment
-  - cmd: conda config --set show_channel_urls yes --set always_yes yes --set changeps1 no
+  - cmd: conda config --set show_channel_urls true --set always_yes true --set changeps1 false
   - cmd: conda update -q conda
-  - cmd: conda config --add channels http://conda.anaconda.org/pandas
+
+  # fix conda-build version
+  # https://github.com/conda/conda-build/issues/1001
+  # disabling 3.4 as windows complains upon compiling byte
+  # code
+
+  - cmd: conda install conda-build=1.21.7
   - cmd: conda config --set ssl_verify false
+
+  # add the pandas channel *before* defaults to have defaults take priority
+  - cmd: conda config --add channels pandas
+  - cmd: conda config --remove channels defaults
+  - cmd: conda config --add channels defaults
+  - cmd: conda install anaconda-client
+
+  # this is now the downloaded conda...
   - cmd: conda info -a
diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json
index 7b9fe353df2e3..f5fa849464881 100644
--- a/asv_bench/asv.conf.json
+++ b/asv_bench/asv.conf.json
@@ -77,11 +77,11 @@
         // On conda install pytables, otherwise tables
         {"environment_type": "conda", "tables": ""},
         {"environment_type": "conda", "pytables": null},
-        {"environment_type": "virtualenv", "tables": null},
-        {"environment_type": "virtualenv", "pytables": ""},
+        {"environment_type": "(?!conda).*", "tables": null},
+        {"environment_type": "(?!conda).*", "pytables": ""},
         // On conda&win32, install libpython
         {"sys_platform": "(?!win32).*", "libpython": ""},
-        {"sys_platform": "win32", "libpython": null},
+        {"environment_type": "conda", "sys_platform": "win32", "libpython": null},
         {"environment_type": "(?!conda).*", "libpython": ""}
     ],
     "include": [],
diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py
new file mode 100644
index 0000000000000..6eac7b4831f0f
--- /dev/null
+++ b/asv_bench/benchmarks/algorithms.py
@@ -0,0 +1,31 @@
+import numpy as np
+import pandas as pd
+
+
+class algorithm(object):
+    goal_time = 0.2
+
+    def setup(self):
+        N = 100000
+
+        self.int_unique = pd.Int64Index(np.arange(N * 5))
+        # cache is_unique
+        self.int_unique.is_unique
+
+        self.int = pd.Int64Index(np.arange(N).repeat(5))
+        self.float = pd.Float64Index(np.random.randn(N).repeat(5))
+
+    def time_int_factorize(self):
+        self.int.factorize()
+
+    def time_float_factorize(self):
+        self.float.factorize()
+
+    def time_int_unique_duplicated(self):
+        self.int_unique.duplicated()
+
+    def time_int_duplicated(self):
+        self.int.duplicated()
+
+    def time_float_duplicated(self):
+        self.float.duplicated()
diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py
index 244af3a577fe2..bf1e1b3f40ab0 100644
--- a/asv_bench/benchmarks/categoricals.py
+++ b/asv_bench/benchmarks/categoricals.py
@@ -1,4 +1,8 @@
 from .pandas_vb_common import *
+try:
+    from pandas.types.concat import union_categoricals
+except ImportError:
+    pass
 import string


@@ -12,6 +16,17 @@ def time_concat_categorical(self):
         concat([self.s, self.s])


+class union_categorical(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.a = pd.Categorical((list('aabbcd') * 1000000))
+        self.b = pd.Categorical((list('bbcdjk') * 1000000))
+
+    def time_union_categorical(self):
+        union_categoricals([self.a, self.b])
+
+
 class categorical_value_counts(object):
     goal_time = 1

diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py
index 5c5a1df4ea1f8..a21dee2e612d2 100644
--- a/asv_bench/benchmarks/frame_methods.py
+++ b/asv_bench/benchmarks/frame_methods.py
@@ -1,4 +1,5 @@
 from .pandas_vb_common import *
+import string


 class frame_apply_axis_1(object):
@@ -606,6 +607,21 @@ def time_frame_isnull(self):
         isnull(self.df)


+class frame_isnull_strings(object):
+    goal_time = 0.2
+
+    def setup(self):
+        np.random.seed(1234)
+        self.sample = np.array(list(string.ascii_lowercase) +
+                               list(string.ascii_uppercase) +
+                               list(string.whitespace))
+        self.data = np.random.choice(self.sample, (1000, 1000))
+        self.df = DataFrame(self.data)
+
+    def time_frame_isnull(self):
+        isnull(self.df)
+
+
 class frame_isnull_obj(object):
     goal_time = 0.2

diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index 586bd00b091fe..0611a3564ff7a 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -379,15 +379,24 @@ def time_groupby_dt_timegrouper_size(self):
#---------------------------------------------------------------------- # groupby with a variable value for ngroups -class groupby_ngroups_10000(object): +class groupby_ngroups_int_10000(object): goal_time = 0.2 + dtype = 'int' + ngroups = 10000 def setup(self): np.random.seed(1234) - self.ngroups = 10000 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) + size = self.ngroups * 2 + rng = np.arange(self.ngroups) + ts = rng.take(np.random.randint(0, self.ngroups, size=size)) + if self.dtype == 'int': + value = np.random.randint(0, size, size=size) + else: + value = np.concatenate([np.random.random(self.ngroups) * 0.1, + np.random.random(self.ngroups) * 10.0]) + + self.df = DataFrame({'timestamp': ts, + 'value': value}) def time_all(self): self.df.groupby('value')['timestamp'].all() @@ -482,109 +491,35 @@ def time_value_counts(self): def time_var(self): self.df.groupby('value')['timestamp'].var() - -class groupby_ngroups_100(object): +class groupby_ngroups_int_100(groupby_ngroups_int_10000): goal_time = 0.2 + dtype = 'int' + ngroups = 100 - def setup(self): - np.random.seed(1234) - self.ngroups = 100 - self.size = (self.ngroups * 2) - self.rng = np.arange(self.ngroups) - self.df = DataFrame(dict(timestamp=self.rng.take(np.random.randint(0, self.ngroups, size=self.size)), value=np.random.randint(0, self.size, size=self.size))) - - def time_all(self): - self.df.groupby('value')['timestamp'].all() - - def time_any(self): - self.df.groupby('value')['timestamp'].any() - - def time_count(self): - self.df.groupby('value')['timestamp'].count() - - def time_cumcount(self): - self.df.groupby('value')['timestamp'].cumcount() - - def time_cummax(self): - self.df.groupby('value')['timestamp'].cummax() - - def time_cummin(self): - self.df.groupby('value')['timestamp'].cummin() - - def time_cumprod(self): - self.df.groupby('value')['timestamp'].cumprod() - - def time_cumsum(self): - self.df.groupby('value')['timestamp'].cumsum() - - def time_describe(self): - self.df.groupby('value')['timestamp'].describe() - - def time_diff(self): - self.df.groupby('value')['timestamp'].diff() - - def time_first(self): - self.df.groupby('value')['timestamp'].first() - - def time_head(self): - self.df.groupby('value')['timestamp'].head() - - def time_last(self): - self.df.groupby('value')['timestamp'].last() - - def time_mad(self): - self.df.groupby('value')['timestamp'].mad() - - def time_max(self): - self.df.groupby('value')['timestamp'].max() - - def time_mean(self): - self.df.groupby('value')['timestamp'].mean() - - def time_median(self): - self.df.groupby('value')['timestamp'].median() - - def time_min(self): - self.df.groupby('value')['timestamp'].min() - - def time_nunique(self): - self.df.groupby('value')['timestamp'].nunique() - - def time_pct_change(self): - self.df.groupby('value')['timestamp'].pct_change() - - def time_prod(self): - self.df.groupby('value')['timestamp'].prod() - - def time_rank(self): - self.df.groupby('value')['timestamp'].rank() - - def time_sem(self): - self.df.groupby('value')['timestamp'].sem() - - def time_size(self): - self.df.groupby('value')['timestamp'].size() - - def time_skew(self): - self.df.groupby('value')['timestamp'].skew() - - def time_std(self): - self.df.groupby('value')['timestamp'].std() +class groupby_ngroups_float_100(groupby_ngroups_int_10000): + goal_time = 0.2 + dtype = 'float' + ngroups = 
100 - def time_sum(self): - self.df.groupby('value')['timestamp'].sum() +class groupby_ngroups_float_10000(groupby_ngroups_int_10000): + goal_time = 0.2 + dtype = 'float' + ngroups = 10000 - def time_tail(self): - self.df.groupby('value')['timestamp'].tail() - def time_unique(self): - self.df.groupby('value')['timestamp'].unique() +class groupby_float32(object): + # GH 13335 + goal_time = 0.2 - def time_value_counts(self): - self.df.groupby('value')['timestamp'].value_counts() + def setup(self): + tmp1 = (np.random.random(10000) * 0.1).astype(np.float32) + tmp2 = (np.random.random(10000) * 10.0).astype(np.float32) + tmp = np.concatenate((tmp1, tmp2)) + arr = np.repeat(tmp, 10) + self.df = DataFrame(dict(a=arr, b=arr)) - def time_var(self): - self.df.groupby('value')['timestamp'].var() + def time_groupby_sum(self): + self.df.groupby(['a'])['b'].sum() #---------------------------------------------------------------------- diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index 8c65f09937df4..a0a1b560d36f3 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -63,6 +63,27 @@ def time_index_datetime_union(self): self.rng.union(self.rng2) +class index_datetime_set_difference(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.A = self.N - 20000 + self.B = self.N + 20000 + self.idx1 = DatetimeIndex(range(self.N)) + self.idx2 = DatetimeIndex(range(self.A, self.B)) + self.idx3 = DatetimeIndex(range(self.N, self.B)) + + def time_index_datetime_difference(self): + self.idx1.difference(self.idx2) + + def time_index_datetime_difference_disjoint(self): + self.idx1.difference(self.idx3) + + def time_index_datetime_symmetric_difference(self): + self.idx1.symmetric_difference(self.idx2) + + class index_float64_boolean_indexer(object): goal_time = 0.2 @@ -183,6 +204,40 @@ def time_index_int64_union(self): self.left.union(self.right) +class index_int64_set_difference(object): + goal_time = 0.2 + + def setup(self): + self.N = 500000 + self.options = np.arange(self.N) + self.left = Index(self.options.take( + np.random.permutation(self.N)[:(self.N // 2)])) + self.right = Index(self.options.take( + np.random.permutation(self.N)[:(self.N // 2)])) + + def time_index_int64_difference(self): + self.left.difference(self.right) + + def time_index_int64_symmetric_difference(self): + self.left.symmetric_difference(self.right) + + +class index_str_set_difference(object): + goal_time = 0.2 + + def setup(self): + self.N = 10000 + self.strs = tm.rands_array(10, self.N) + self.left = Index(self.strs[:self.N * 2 // 3]) + self.right = Index(self.strs[self.N // 3:]) + + def time_str_difference(self): + self.left.difference(self.right) + + def time_str_symmetric_difference(self): + self.left.symmetric_difference(self.right) + + class index_str_boolean_indexer(object): goal_time = 0.2 diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 32d80a7913234..094ae23a92fad 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -19,24 +19,6 @@ def time_dataframe_getitem_scalar(self): self.df[self.col][self.idx] -class datamatrix_getitem_scalar(object): - goal_time = 0.2 - - def setup(self): - try: - self.klass = DataMatrix - except: - self.klass = DataFrame - self.index = tm.makeStringIndex(1000) - self.columns = tm.makeStringIndex(30) - self.df = self.klass(np.random.rand(1000, 30), index=self.index, columns=self.columns) - self.idx = self.index[100] - self.col = 
self.columns[10] - - def time_datamatrix_getitem_scalar(self): - self.df[self.col][self.idx] - - class series_get_value(object): goal_time = 0.2 @@ -486,4 +468,15 @@ def setup(self): self.midx = self.midx.take(np.random.permutation(np.arange(100000))) def time_sort_level_zero(self): - self.midx.sortlevel(0) \ No newline at end of file + self.midx.sortlevel(0) + +class float_loc(object): + # GH 13166 + goal_time = 0.2 + + def setup(self): + a = np.arange(100000) + self.ind = pd.Float64Index(a * 4.8000000418824129e-08) + + def time_float_loc(self): + self.ind.get_loc(0) diff --git a/asv_bench/benchmarks/inference.py b/asv_bench/benchmarks/inference.py index 3fceed087facb..0f9689dadcbb0 100644 --- a/asv_bench/benchmarks/inference.py +++ b/asv_bench/benchmarks/inference.py @@ -135,4 +135,30 @@ def setup(self): self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), B=self.df_datetime64['B'])) def time_dtype_infer_uint32(self): - (self.df_uint32['A'] + self.df_uint32['B']) \ No newline at end of file + (self.df_uint32['A'] + self.df_uint32['B']) + + +class to_numeric(object): + + param_names = ['dtype', 'downcast'] + params = [['string-float', 'string-int', 'string-nint', 'datetime64', + 'int-list', 'int32'], + [None, 'integer', 'signed', 'unsigned', 'float']] + + N = 500000 + + data_dict = { + 'string-int': (['1'] * (N / 2)) + ([2] * (N / 2)), + 'string-nint': (['-1'] * (N / 2)) + ([2] * (N / 2)), + 'datetime64': np.repeat(np.array(['1970-01-01', '1970-01-02'], + dtype='datetime64[D]'), N), + 'string-float': (['1.1'] * (N / 2)) + ([2] * (N / 2)), + 'int-list': ([1] * (N / 2)) + ([2] * (N / 2)), + 'int32': np.repeat(np.int32(1), N) + } + + def setup(self, dtype, downcast): + self.data = self.data_dict[dtype] + + def time_downcast(self, dtype, downcast): + pd.to_numeric(self.data, downcast=downcast) diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index 39ebd9cb1cb73..c98179c8950c5 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -179,10 +179,6 @@ def setup(self): self.df_multi = DataFrame(np.random.randn(len(self.index2), 4), index=self.index2, columns=['A', 'B', 'C', 'D']) except: pass - try: - self.DataFrame = DataMatrix - except: - pass self.df = pd.DataFrame({'data1': np.random.randn(100000), 'data2': np.random.randn(100000), 'key1': self.key1, 'key2': self.key2, }) self.df_key1 = pd.DataFrame(np.random.randn(len(self.level1), 4), index=self.level1, columns=['A', 'B', 'C', 'D']) self.df_key2 = pd.DataFrame(np.random.randn(len(self.level2), 4), index=self.level2, columns=['A', 'B', 'C', 'D']) @@ -210,10 +206,6 @@ def setup(self): self.df_multi = DataFrame(np.random.randn(len(self.index2), 4), index=self.index2, columns=['A', 'B', 'C', 'D']) except: pass - try: - self.DataFrame = DataMatrix - except: - pass self.df = pd.DataFrame({'data1': np.random.randn(100000), 'data2': np.random.randn(100000), 'key1': self.key1, 'key2': self.key2, }) self.df_key1 = pd.DataFrame(np.random.randn(len(self.level1), 4), index=self.level1, columns=['A', 'B', 'C', 'D']) self.df_key2 = pd.DataFrame(np.random.randn(len(self.level2), 4), index=self.level2, columns=['A', 'B', 'C', 'D']) @@ -241,10 +233,6 @@ def setup(self): self.df_multi = DataFrame(np.random.randn(len(self.index2), 4), index=self.index2, columns=['A', 'B', 'C', 'D']) except: pass - try: - self.DataFrame = DataMatrix - except: - pass self.df = pd.DataFrame({'data1': np.random.randn(100000), 'data2': np.random.randn(100000), 'key1': 
self.key1, 'key2': self.key2, }) self.df_key1 = pd.DataFrame(np.random.randn(len(self.level1), 4), index=self.level1, columns=['A', 'B', 'C', 'D']) self.df_key2 = pd.DataFrame(np.random.randn(len(self.level2), 4), index=self.level2, columns=['A', 'B', 'C', 'D']) @@ -272,10 +260,6 @@ def setup(self): self.df_multi = DataFrame(np.random.randn(len(self.index2), 4), index=self.index2, columns=['A', 'B', 'C', 'D']) except: pass - try: - self.DataFrame = DataMatrix - except: - pass self.df = pd.DataFrame({'data1': np.random.randn(100000), 'data2': np.random.randn(100000), 'key1': self.key1, 'key2': self.key2, }) self.df_key1 = pd.DataFrame(np.random.randn(len(self.level1), 4), index=self.level1, columns=['A', 'B', 'C', 'D']) self.df_key2 = pd.DataFrame(np.random.randn(len(self.level2), 4), index=self.level2, columns=['A', 'B', 'C', 'D']) @@ -309,6 +293,62 @@ def time_join_dataframe_integer_key(self): merge(self.df, self.df2, on='key1') +class merge_asof_noby(object): + + def setup(self): + np.random.seed(0) + one_count = 200000 + two_count = 1000000 + self.df1 = pd.DataFrame({'time': np.random.randint(0, one_count/20, one_count), + 'value1': np.random.randn(one_count)}) + self.df2 = pd.DataFrame({'time': np.random.randint(0, two_count/20, two_count), + 'value2': np.random.randn(two_count)}) + self.df1 = self.df1.sort_values('time') + self.df2 = self.df2.sort_values('time') + + def time_merge_asof_noby(self): + merge_asof(self.df1, self.df2, on='time') + + +class merge_asof_by_object(object): + + def setup(self): + import string + np.random.seed(0) + one_count = 200000 + two_count = 1000000 + self.df1 = pd.DataFrame({'time': np.random.randint(0, one_count/20, one_count), + 'key': np.random.choice(list(string.uppercase), one_count), + 'value1': np.random.randn(one_count)}) + self.df2 = pd.DataFrame({'time': np.random.randint(0, two_count/20, two_count), + 'key': np.random.choice(list(string.uppercase), two_count), + 'value2': np.random.randn(two_count)}) + self.df1 = self.df1.sort_values('time') + self.df2 = self.df2.sort_values('time') + + def time_merge_asof_by_object(self): + merge_asof(self.df1, self.df2, on='time', by='key') + + +class merge_asof_by_int(object): + + def setup(self): + np.random.seed(0) + one_count = 200000 + two_count = 1000000 + self.df1 = pd.DataFrame({'time': np.random.randint(0, one_count/20, one_count), + 'key': np.random.randint(0, 25, one_count), + 'value1': np.random.randn(one_count)}) + self.df2 = pd.DataFrame({'time': np.random.randint(0, two_count/20, two_count), + 'key': np.random.randint(0, 25, two_count), + 'value2': np.random.randn(two_count)}) + self.df1 = self.df1.sort_values('time') + self.df2 = self.df2.sort_values('time') + + def time_merge_asof_by_int(self): + merge_asof(self.df1, self.df2, on='time', by='key') + + class join_non_unique_equal(object): goal_time = 0.2 diff --git a/asv_bench/benchmarks/parser_vb.py b/asv_bench/benchmarks/parser_vb.py index 04f25034638cd..6dc8bffd6dac9 100644 --- a/asv_bench/benchmarks/parser_vb.py +++ b/asv_bench/benchmarks/parser_vb.py @@ -114,6 +114,27 @@ def teardown(self): os.remove('test.csv') +class read_csv_categorical(object): + goal_time = 0.2 + + def setup(self): + N = 100000 + group1 = ['aaaaaaaa', 'bbbbbbb', 'cccccccc', 'dddddddd', 'eeeeeeee'] + df = DataFrame({'a': np.random.choice(group1, N).astype('object'), + 'b': np.random.choice(group1, N).astype('object'), + 'c': np.random.choice(group1, N).astype('object')}) + df.to_csv('strings.csv', index=False) + + def time_read_csv_categorical_post(self): + 
read_csv('strings.csv').apply(pd.Categorical) + + def time_read_csv_categorical_direct(self): + read_csv('strings.csv', dtype='category') + + def teardown(self): + os.remove('strings.csv') + + class read_table_multiple_date(object): goal_time = 0.2 diff --git a/asv_bench/benchmarks/period.py b/asv_bench/benchmarks/period.py index 012030a71ac82..75b2c2dcacfed 100644 --- a/asv_bench/benchmarks/period.py +++ b/asv_bench/benchmarks/period.py @@ -1,4 +1,4 @@ -from pandas import PeriodIndex, date_range +from pandas import Series, Period, PeriodIndex, date_range class create_period_index_from_date_range(object): @@ -7,3 +7,43 @@ class create_period_index_from_date_range(object): def time_period_index(self): # Simulate irregular PeriodIndex PeriodIndex(date_range('1985', periods=1000).to_pydatetime(), freq='D') + + +class period_setitem(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') + if hasattr(Series, 'convert'): + Series.resample = Series.convert + self.ts = Series(np.random.randn(self.N), index=self.rng) + self.rng = period_range(start='1/1/1990', freq='S', periods=20000) + self.df = DataFrame(index=range(len(self.rng))) + + def time_period_setitem(self): + self.df['col'] = self.rng + + +class period_algorithm(object): + goal_time = 0.2 + + def setup(self): + data = [Period('2011-01', freq='M'), Period('2011-02', freq='M'), + Period('2011-03', freq='M'), Period('2011-04', freq='M')] + self.s = Series(data * 1000) + self.i = PeriodIndex(data, freq='M') + + def time_period_series_drop_duplicates(self): + self.s.drop_duplicates() + + def time_period_index_drop_duplicates(self): + self.i.drop_duplicates() + + def time_period_series_value_counts(self): + self.s.value_counts() + + def time_period_index_value_counts(self): + self.i.value_counts() + + diff --git a/asv_bench/benchmarks/timedelta.py b/asv_bench/benchmarks/timedelta.py index 2f252a4d3e1dc..9719fd87dfb2e 100644 --- a/asv_bench/benchmarks/timedelta.py +++ b/asv_bench/benchmarks/timedelta.py @@ -31,4 +31,19 @@ def setup(self): self.arr = ['00:00:{0:02d}'.format(i) for i in self.arr] def time_timedelta_convert_string_seconds(self): - to_timedelta(self.arr) \ No newline at end of file + to_timedelta(self.arr) + + +class timedelta_convert_bad_parse(object): + goal_time = 0.2 + + def setup(self): + self.arr = np.random.randint(0, 1000, size=10000) + self.arr = ['{0} days'.format(i) for i in self.arr] + self.arr[-1] = 'apple' + + def time_timedelta_convert_coerce(self): + to_timedelta(self.arr, errors='coerce') + + def time_timedelta_convert_ignore(self): + to_timedelta(self.arr, errors='ignore') diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index bdf193cd1f3d3..fda6ebb4b437e 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -218,20 +218,20 @@ def time_dti_reset_index_tz(self): self.df.reset_index() -class period_setitem(object): +class datetime_algorithm(object): goal_time = 0.2 def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - if hasattr(Series, 'convert'): - Series.resample = Series.convert - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.rng = period_range(start='1/1/1990', freq='S', periods=20000) - self.df = DataFrame(index=range(len(self.rng))) + N = 100000 + self.dti = pd.date_range('2011-01-01', freq='H', periods=N).repeat(5) + self.dti_tz = pd.date_range('2011-01-01', freq='H', periods=N, + 
tz='Asia/Tokyo').repeat(5) + + def time_dti_factorize(self): + self.dti.factorize() - def time_period_setitem(self): - self.df['col'] = self.rng + def time_dti_tz_factorize(self): + self.dti_tz.factorize() class timeseries_1min_5min_mean(object): @@ -1155,3 +1155,63 @@ def setup(self): def time_timeseries_year_incr(self): (self.date + self.year) + + +class timeseries_semi_month_offset(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') + # date is not on an offset which will be slowest case + self.date = dt.datetime(2011, 1, 2) + self.semi_month_end = pd.offsets.SemiMonthEnd() + self.semi_month_begin = pd.offsets.SemiMonthBegin() + + def time_semi_month_end_apply(self): + self.semi_month_end.apply(self.date) + + def time_semi_month_end_incr(self): + self.date + self.semi_month_end + + def time_semi_month_end_incr_n(self): + self.date + 10 * self.semi_month_end + + def time_semi_month_end_decr(self): + self.date - self.semi_month_end + + def time_semi_month_end_decr_n(self): + self.date - 10 * self.semi_month_end + + def time_semi_month_end_apply_index(self): + self.semi_month_end.apply_index(self.rng) + + def time_semi_month_end_incr_rng(self): + self.rng + self.semi_month_end + + def time_semi_month_end_decr_rng(self): + self.rng - self.semi_month_end + + def time_semi_month_begin_apply(self): + self.semi_month_begin.apply(self.date) + + def time_semi_month_begin_incr(self): + self.date + self.semi_month_begin + + def time_semi_month_begin_incr_n(self): + self.date + 10 * self.semi_month_begin + + def time_semi_month_begin_decr(self): + self.date - self.semi_month_begin + + def time_semi_month_begin_decr_n(self): + self.date - 10 * self.semi_month_begin + + def time_semi_month_begin_apply_index(self): + self.semi_month_begin.apply_index(self.rng) + + def time_semi_month_begin_incr_rng(self): + self.rng + self.semi_month_begin + + def time_semi_month_begin_decr_rng(self): + self.rng - self.semi_month_begin diff --git a/bench/bench_sparse.py b/bench/bench_sparse.py deleted file mode 100644 index 0aa705118d970..0000000000000 --- a/bench/bench_sparse.py +++ /dev/null @@ -1,92 +0,0 @@ -import numpy as np - -from pandas import * -import pandas.core.sparse as spm -import pandas.compat as compat -reload(spm) -from pandas.core.sparse import * - -N = 10000. 
- -arr1 = np.arange(N) -index = Index(np.arange(N)) - -off = N // 10 -arr1[off: 2 * off] = np.NaN -arr1[4 * off: 5 * off] = np.NaN -arr1[8 * off: 9 * off] = np.NaN - -arr2 = np.arange(N) -arr2[3 * off // 2: 2 * off + off // 2] = np.NaN -arr2[8 * off + off // 2: 9 * off + off // 2] = np.NaN - -s1 = SparseSeries(arr1, index=index) -s2 = SparseSeries(arr2, index=index) - -is1 = SparseSeries(arr1, kind='integer', index=index) -is2 = SparseSeries(arr2, kind='integer', index=index) - -s1_dense = s1.to_dense() -s2_dense = s2.to_dense() - -if compat.is_platform_linux(): - pth = '/home/wesm/code/pandas/example' -else: - pth = '/Users/wesm/code/pandas/example' - -dm = DataFrame.load(pth) - -sdf = dm.to_sparse() - - -def new_data_like(sdf): - new_data = {} - for col, series in compat.iteritems(sdf): - new_data[col] = SparseSeries(np.random.randn(len(series.sp_values)), - index=sdf.index, - sparse_index=series.sp_index, - fill_value=series.fill_value) - - return SparseDataFrame(new_data) - -# data = {} -# for col, ser in dm.iteritems(): -# data[col] = SparseSeries(ser) - -dwp = Panel.fromDict({'foo': dm}) -# sdf = SparseDataFrame(data) - - -lp = stack_sparse_frame(sdf) - - -swp = SparsePanel({'A': sdf}) -swp = SparsePanel({'A': sdf, - 'B': sdf, - 'C': sdf, - 'D': sdf}) - -y = sdf -x = SparsePanel({'x1': sdf + new_data_like(sdf) / 10, - 'x2': sdf + new_data_like(sdf) / 10}) - -dense_y = sdf -dense_x = x.to_dense() - -# import hotshot, hotshot.stats -# prof = hotshot.Profile('test.prof') - -# benchtime, stones = prof.runcall(ols, y=y, x=x) - -# prof.close() - -# stats = hotshot.stats.load('test.prof') - -dense_model = ols(y=dense_y, x=dense_x) - -import pandas.stats.plm as plm -import pandas.stats.interface as face -reload(plm) -reload(face) - -# model = face.ols(y=y, x=x) diff --git a/ci/before_install_travis.sh b/ci/before_install_travis.sh index 76775ecbc78f0..f90427f97d3b7 100755 --- a/ci/before_install_travis.sh +++ b/ci/before_install_travis.sh @@ -9,8 +9,6 @@ echo "inside $0" # overview if [ "${TRAVIS_OS_NAME}" == "linux" ]; then - sudo apt-get update $APT_ARGS # run apt-get update for all versions - sh -e /etc/init.d/xvfb start fi diff --git a/ci/check_cache.sh b/ci/check_cache.sh new file mode 100755 index 0000000000000..cd7a6e8f6b6f9 --- /dev/null +++ b/ci/check_cache.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +if [ "$TRAVIS_PULL_REQUEST" == "false" ] +then + echo "Not a PR: checking for changes in ci/ from last 2 commits" + git diff HEAD~2 --numstat | grep -E "ci/" + ci_changes=$(git diff HEAD~2 --numstat | grep -E "ci/"| wc -l) +else + echo "PR: checking for changes in ci/ from last 2 commits" + git fetch origin pull/${TRAVIS_PULL_REQUEST}/head:PR_HEAD + git diff PR_HEAD~2 --numstat | grep -E "ci/" + ci_changes=$(git diff PR_HEAD~2 --numstat | grep -E "ci/"| wc -l) +fi + +MINICONDA_DIR="$HOME/miniconda/" +CACHE_DIR="$HOME/.cache/" +CCACHE_DIR="$HOME/.ccache/" + +if [ $ci_changes -ne 0 ] +then + echo "Files have changed in ci/ deleting all caches" + rm -rf "$MINICONDA_DIR" + rm -rf "$CACHE_DIR" + rm -rf "$CCACHE_DIR" +fi \ No newline at end of file diff --git a/ci/cron/go_doc.sh b/ci/cron/go_doc.sh deleted file mode 100755 index 89659577d0e7f..0000000000000 --- a/ci/cron/go_doc.sh +++ /dev/null @@ -1,99 +0,0 @@ -#!/bin/bash - -# This is a one-command cron job for setting up -# a virtualenv-based, linux-based, py2-based environment -# for building the Pandas documentation. -# -# The first run will install all required deps from pypi -# into the venv including monsters like scipy. 
-# You may want to set it up yourself to speed up the -# process. -# -# This is meant to be run as a cron job under a dedicated -# user account whose HOME directory contains this script. -# a CI directory will be created under it and all files -# stored within it. -# -# The hardcoded dep versions will gradually become obsolete -# You may need to tweak them -# -# @y-p, Jan/2014 - -# disto latex is sometimes finicky. Optionall use -# a local texlive install -export PATH=/mnt/debian/texlive/2013/bin/x86_64-linux:$PATH - -# Having ccache will speed things up -export PATH=/usr/lib64/ccache/:$PATH - -# limit disk usage -ccache -M 200M - -BASEDIR="$HOME/CI" -REPO_URL="https://github.com/pydata/pandas" -REPO_LOC="$BASEDIR/pandas" - -if [ ! -d $BASEDIR ]; then - mkdir -p $BASEDIR - virtualenv $BASEDIR/venv -fi - -source $BASEDIR/venv/bin/activate - -pip install numpy==1.7.2 -pip install cython==0.20.0 -pip install python-dateutil==2.2 -pip install --pre pytz==2013.9 -pip install sphinx==1.1.3 -pip install numexpr==2.2.2 - -pip install matplotlib==1.3.0 -pip install lxml==3.2.5 -pip install beautifulsoup4==4.3.2 -pip install html5lib==0.99 - -# You'll need R as well -pip install rpy2==2.3.9 - -pip install tables==3.0.0 -pip install bottleneck==0.7.0 -pip install ipython==0.13.2 - -# only if you have too -pip install scipy==0.13.2 - -pip install openpyxl==1.6.2 -pip install xlrd==0.9.2 -pip install xlwt==0.7.5 -pip install xlsxwriter==0.5.1 -pip install sqlalchemy==0.8.3 - -if [ ! -d "$REPO_LOC" ]; then - git clone "$REPO_URL" "$REPO_LOC" -fi - -cd "$REPO_LOC" -git reset --hard -git clean -df -git checkout master -git pull origin -make - -source $BASEDIR/venv/bin/activate -export PATH="/usr/lib64/ccache/:$PATH" -pip uninstall pandas -yq -pip install "$REPO_LOC" - -cd "$REPO_LOC"/doc - -python make.py clean -python make.py html -if [ ! $? 
== 0 ]; then - exit 1 -fi -python make.py zip_html -# usually requires manual intervention -# python make.py latex - -# If you have access: -# python make.py upload_dev diff --git a/ci/install-2.7_NUMPY_DEV.sh b/ci/install-2.7_NUMPY_DEV.sh index 00b6255daf70f..22ac8f6547879 100644 --- a/ci/install-2.7_NUMPY_DEV.sh +++ b/ci/install-2.7_NUMPY_DEV.sh @@ -12,8 +12,6 @@ pip uninstall numpy -y # these wheels don't play nice with the conda libgfortran / openblas # time conda install -n pandas libgfortran openblas || exit 1 -time sudo apt-get $APT_ARGS install libatlas-base-dev gfortran - # install numpy wheel from master pip install --pre --upgrade --no-index --timeout=60 --trusted-host travis-dev-wheels.scipy.org -f http://travis-dev-wheels.scipy.org/ numpy diff --git a/ci/install-3.5_NUMPY_DEV.sh b/ci/install-3.5_NUMPY_DEV.sh index ecb07ca23c667..946ec43ad9f1a 100644 --- a/ci/install-3.5_NUMPY_DEV.sh +++ b/ci/install-3.5_NUMPY_DEV.sh @@ -12,8 +12,6 @@ pip uninstall numpy -y # these wheels don't play nice with the conda libgfortran / openblas # time conda install -n pandas libgfortran openblas || exit 1 -time sudo apt-get $APT_ARGS install libatlas-base-dev gfortran - # install numpy wheel from master pip install --pre --upgrade --no-index --timeout=60 --trusted-host travis-dev-wheels.scipy.org -f http://travis-dev-wheels.scipy.org/ numpy scipy diff --git a/ci/install_travis.sh b/ci/install_travis.sh index 07d33f56beba8..98ce36acc096e 100755 --- a/ci/install_travis.sh +++ b/ci/install_travis.sh @@ -28,64 +28,68 @@ function edit_init() edit_init +home_dir=$(pwd) +echo "home_dir: [$home_dir]" + python_major_version="${TRAVIS_PYTHON_VERSION:0:1}" [ "$python_major_version" == "2" ] && python_major_version="" -home_dir=$(pwd) -echo "home_dir: [$home_dir]" +MINICONDA_DIR="$HOME/miniconda" -if [ -n "$LOCALE_OVERRIDE" ]; then - # make sure the locale is available - # probably useless, since you would need to relogin - time sudo locale-gen "$LOCALE_OVERRIDE" +if [ -d "$MINICONDA_DIR" ] && [ -e "$MINICONDA_DIR/bin/conda" ] && [ "$USE_CACHE" ]; then + echo "Miniconda install already present from cache: $MINICONDA_DIR" - # Need to enable for locale testing. The location of the locale file(s) is - # distro specific. 
For example, on Arch Linux all of the locales are in a - # commented file--/etc/locale.gen--that must be commented in to be used - # whereas Ubuntu looks in /var/lib/locales/supported.d/* and generates locales - # based on what's in the files in that folder - time echo 'it_CH.UTF-8 UTF-8' | sudo tee -a /var/lib/locales/supported.d/it - time sudo locale-gen + conda config --set always_yes yes --set changeps1 no || exit 1 + echo "update conda" + conda update -q conda || exit 1 -fi + # Useful for debugging any issues with conda + conda info -a || exit 1 -# install gui for clipboard testing -if [ -n "$CLIPBOARD_GUI" ]; then - echo "Using CLIPBOARD_GUI: $CLIPBOARD_GUI" - [ -n "$python_major_version" ] && py="py" - python_cb_gui_pkg=python${python_major_version}-${py}${CLIPBOARD_GUI} - time sudo apt-get $APT_ARGS install $python_cb_gui_pkg -fi + # set the compiler cache to work + if [ "${TRAVIS_OS_NAME}" == "linux" ]; then + echo "Using ccache" + export PATH=/usr/lib/ccache:/usr/lib64/ccache:$PATH + gcc=$(which gcc) + echo "gcc: $gcc" + ccache=$(which ccache) + echo "ccache: $ccache" + export CC='ccache gcc' + fi +else + echo "Using clean Miniconda install" + echo "Not using ccache" + rm -rf "$MINICONDA_DIR" + # install miniconda + if [ "${TRAVIS_OS_NAME}" == "osx" ]; then + wget http://repo.continuum.io/miniconda/Miniconda-latest-MacOSX-x86_64.sh -O miniconda.sh || exit 1 + else + wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh || exit 1 + fi + bash miniconda.sh -b -p "$MINICONDA_DIR" || exit 1 -# install a clipboard if $CLIPBOARD is not empty -if [ -n "$CLIPBOARD" ]; then - echo "Using clipboard: $CLIPBOARD" - time sudo apt-get $APT_ARGS install $CLIPBOARD -fi + echo "update conda" + conda config --set ssl_verify false || exit 1 + conda config --set always_yes true --set changeps1 false || exit 1 + conda update -q conda -python_major_version="${TRAVIS_PYTHON_VERSION:0:1}" -[ "$python_major_version" == "2" ] && python_major_version="" + # add the pandas channel *before* defaults to have defaults take priority + echo "add channels" + conda config --add channels pandas || exit 1 + conda config --remove channels defaults || exit 1 + conda config --add channels defaults || exit 1 -# install miniconda -if [ "${TRAVIS_OS_NAME}" == "osx" ]; then - wget http://repo.continuum.io/miniconda/Miniconda-latest-MacOSX-x86_64.sh -O miniconda.sh || exit 1 -else - wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh || exit 1 -fi -bash miniconda.sh -b -p $HOME/miniconda || exit 1 + conda install anaconda-client -conda config --set always_yes yes --set changeps1 no || exit 1 -conda update -q conda || exit 1 -conda config --add channels http://conda.anaconda.org/pandas || exit 1 -conda config --set ssl_verify false || exit 1 + # Useful for debugging any issues with conda + conda info -a || exit 1 -# Useful for debugging any issues with conda -conda info -a || exit 1 + time conda create -n pandas python=$TRAVIS_PYTHON_VERSION nose coverage flake8 || exit 1 +fi # build deps REQ="ci/requirements-${TRAVIS_PYTHON_VERSION}${JOB_TAG}.build" -time conda create -n pandas python=$TRAVIS_PYTHON_VERSION nose coverage flake8 || exit 1 # may have additional installation instructions for this build INSTALL="ci/install-${TRAVIS_PYTHON_VERSION}${JOB_TAG}.sh" @@ -98,16 +102,6 @@ time conda install -n pandas --file=${REQ} || exit 1 source activate pandas -# set the compiler cache to work -if [[ "$IRON_TOKEN" && "${TRAVIS_OS_NAME}" == "linux" ]]; then - 
export PATH=/usr/lib/ccache:/usr/lib64/ccache:$PATH - gcc=$(which gcc) - echo "gcc: $gcc" - ccache=$(which ccache) - echo "ccache: $ccache" - export CC='ccache gcc' -fi - if [ "$BUILD_TEST" ]; then # build testing @@ -144,5 +138,9 @@ else fi +if [ "$JOB_NAME" == "34_slow" ]; then + conda install -c conda-forge/label/rc -c conda-forge matplotlib +fi + echo "done" exit 0 diff --git a/ci/lint.sh b/ci/lint.sh index a4c960084040f..a866b04445f96 100755 --- a/ci/lint.sh +++ b/ci/lint.sh @@ -8,7 +8,7 @@ RET=0 if [ "$LINT" ]; then echo "Linting" - for path in 'core' 'indexes' 'types' 'formats' 'io' 'stats' 'compat' 'sparse' 'tools' 'tseries' 'tests' 'computation' 'util' + for path in 'api' 'core' 'indexes' 'types' 'formats' 'io' 'stats' 'compat' 'sparse' 'tools' 'tseries' 'tests' 'computation' 'util' do echo "linting -> pandas/$path" flake8 pandas/$path --filename '*.py' @@ -17,7 +17,23 @@ if [ "$LINT" ]; then fi done - echo "Linting DONE" + echo "Linting *.py DONE" + + echo "Linting *.pyx" + flake8 pandas --filename '*.pyx' --select=E501,E302,E203,E111,E114,E221,E303,E128,E231,E126 + echo "Linting *.pyx DONE" + + echo "Linting *.pxi.in" + for path in 'src' + do + echo "linting -> pandas/$path" + flake8 pandas/$path --filename '*.pxi.in' --select=E501,E302,E203,E111,E114,E221,E303,E231,E126 + if [ $? -ne "0" ]; then + RET=1 + fi + + done + echo "Linting *.pxi.in DONE" echo "Check for invalid testing" grep -r -E --include '*.py' --exclude nosetester.py --exclude testing.py '(numpy|np)\.testing' pandas diff --git a/ci/prep_ccache.sh b/ci/prep_ccache.sh deleted file mode 100755 index 7e586cc4d3085..0000000000000 --- a/ci/prep_ccache.sh +++ /dev/null @@ -1,52 +0,0 @@ -#!/bin/bash - -if [ "${TRAVIS_OS_NAME}" != "linux" ]; then - echo "not using ccache on non-linux" - exit 0 -fi - -if [ "$IRON_TOKEN" ]; then - - home_dir=$(pwd) - - # install the compiler cache - sudo apt-get $APT_ARGS install ccache p7zip-full - # iron_cache, pending py3 fixes upstream - pip install -I --allow-external --allow-insecure git+https://github.com/iron-io/iron_cache_python.git@8a451c7d7e4d16e0c3bedffd0f280d5d9bd4fe59#egg=iron_cache - - python ci/ironcache/get.py - ccache -C - - clear_cache=0 - if [ -f ~/ccache.7z ]; then - echo "Cache retrieved" - clear_cache=1 - cd $HOME - 7za e $HOME/ccache.7z - # ls -l $HOME - cd / - tar xvf $HOME/ccache - rm -rf $HOME/ccache.7z - rm -rf $HOME/ccache - - fi - - # did the last commit change cython files? 
- cd $home_dir - - retval=$(git diff HEAD~3 --numstat | grep -P "pyx|pxd"|wc -l) - echo "number of cython files changed: $retval" - - if [ $clear_cache -eq 1 ] && [ $retval -eq 0 ] - then - # nope, reuse cython files - echo "Will reuse cached cython file" - touch "$TRAVIS_BUILD_DIR"/pandas/*.c - touch "$TRAVIS_BUILD_DIR"/pandas/src/*.c - touch "$TRAVIS_BUILD_DIR"/pandas/*.cpp - else - echo "Rebuilding cythonized files" - fi -fi - -exit 0 diff --git a/ci/prep_cython_cache.sh b/ci/prep_cython_cache.sh new file mode 100755 index 0000000000000..6f16dce2fb431 --- /dev/null +++ b/ci/prep_cython_cache.sh @@ -0,0 +1,74 @@ +#!/bin/bash + +ls "$HOME/.cache/" + +PYX_CACHE_DIR="$HOME/.cache/pyxfiles" +pyx_file_list=`find ${TRAVIS_BUILD_DIR} -name "*.pyx"` +pyx_cache_file_list=`find ${PYX_CACHE_DIR} -name "*.pyx"` + +CACHE_File="$HOME/.cache/cython_files.tar" + +# Clear the cython cache 0 = NO, 1 = YES +clear_cache=0 + +pyx_files=`echo "$pyx_file_list" | wc -l` +pyx_cache_files=`echo "$pyx_cache_file_list" | wc -l` + +if [[ pyx_files -ne pyx_cache_files ]] +then + echo "Different number of pyx files" + clear_cache=1 +fi + +home_dir=$(pwd) + +if [ -f "$CACHE_File" ] && [ "$USE_CACHE" ] && [ -d "$PYX_CACHE_DIR" ]; then + + echo "Cache available - checking pyx diff" + + for i in ${pyx_file_list} + do + diff=`diff -u $i $PYX_CACHE_DIR${i}` + if [[ $? -eq 2 ]] + then + echo "${i##*/} can't be diffed; probably not in cache" + clear_cache=1 + fi + if [[ ! -z $diff ]] + then + echo "${i##*/} has changed:" + echo $diff + clear_cache=1 + fi + done + + if [ "$TRAVIS_PULL_REQUEST" == "false" ] + then + echo "Not a PR" + # Uncomment next 2 lines to turn off cython caching not in a PR + # echo "Non PR cython caching is disabled" + # clear_cache=1 + else + echo "In a PR" + # Uncomment next 2 lines to turn off cython caching in a PR + # echo "PR cython caching is disabled" + # clear_cache=1 + fi + +fi + +if [ $clear_cache -eq 0 ] && [ "$USE_CACHE" ] +then + # No and use_cache is set + echo "Will reuse cached cython file" + cd / + tar xvmf $CACHE_File + cd $home_dir +else + echo "Rebuilding cythonized files" + echo "Use cache (Blank if not set) = $USE_CACHE" + echo "Clear cache (1=YES) = $clear_cache" +fi + + +exit 0 diff --git a/ci/requirements-2.7.build b/ci/requirements-2.7.build index eca8460468d34..b2e2038faf7c3 100644 --- a/ci/requirements-2.7.build +++ b/ci/requirements-2.7.build @@ -1,4 +1,4 @@ python-dateutil=2.4.1 pytz=2013b -numpy=1.9.3 +numpy cython=0.19.1 diff --git a/ci/requirements-2.7.pip b/ci/requirements-2.7.pip index cc3462dbf9ed0..d16b932c8be4f 100644 --- a/ci/requirements-2.7.pip +++ b/ci/requirements-2.7.pip @@ -6,3 +6,4 @@ oauth2client==1.5.0 pathlib backports.lzma py +PyCrypto diff --git a/ci/requirements-2.7.run b/ci/requirements-2.7.run index 0c1132eaa62d3..560d6571b8771 100644 --- a/ci/requirements-2.7.run +++ b/ci/requirements-2.7.run @@ -1,6 +1,6 @@ python-dateutil=2.4.1 pytz=2013b -numpy=1.9.3 +numpy xlwt=0.7.5 numexpr pytables diff --git a/ci/requirements-2.7_DOC_BUILD.run b/ci/requirements-2.7_DOC_BUILD.run index 854776762fdb5..b87a41df4191d 100644 --- a/ci/requirements-2.7_DOC_BUILD.run +++ b/ci/requirements-2.7_DOC_BUILD.run @@ -1,6 +1,9 @@ ipython +ipykernel sphinx nbconvert +nbformat +notebook matplotlib scipy lxml diff --git a/ci/submit_ccache.sh b/ci/submit_ccache.sh deleted file mode 100755 index 7630bb7cc2760..0000000000000 --- a/ci/submit_ccache.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash - -if [ "${TRAVIS_OS_NAME}" != "linux" ]; then - echo "not using ccache on non-linux" - 
exit 0 -fi - -if [ "$IRON_TOKEN" ]; then - - home_dir=$(pwd) - ccache -s - - MISSES=$(ccache -s | grep "cache miss" | grep -Po "\d+") - echo "MISSES: $MISSES" - - if [ x"$MISSES" == x"0" ]; then - echo "No cache misses detected, skipping upload" - exit 0 - fi - - # install the compiler cache - sudo apt-get $APT_ARGS install ccache p7zip-full - # iron_cache, pending py3 fixes upstream - pip install -I --allow-external --allow-insecure git+https://github.com/iron-io/iron_cache_python.git@8a451c7d7e4d16e0c3bedffd0f280d5d9bd4fe59#egg=iron_cache - - rm -rf $HOME/ccache.7z - - tar cf - $HOME/.ccache \ - "$TRAVIS_BUILD_DIR"/pandas/{index,algos,lib,tslib,parser,hashtable}.c \ - "$TRAVIS_BUILD_DIR"/pandas/src/{sparse,testing}.c \ - "$TRAVIS_BUILD_DIR"/pandas/msgpack.cpp \ - | 7za a -si $HOME/ccache.7z - - split -b 500000 -d $HOME/ccache.7z $HOME/ccache. - - python ci/ironcache/put.py -fi - -exit 0 diff --git a/ci/submit_cython_cache.sh b/ci/submit_cython_cache.sh new file mode 100755 index 0000000000000..4f60df0ccb2d8 --- /dev/null +++ b/ci/submit_cython_cache.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +CACHE_File="$HOME/.cache/cython_files.tar" +PYX_CACHE_DIR="$HOME/.cache/pyxfiles" +pyx_file_list=`find ${TRAVIS_BUILD_DIR} -name "*.pyx"` + +rm -rf $CACHE_File +rm -rf $PYX_CACHE_DIR + +home_dir=$(pwd) + +mkdir $PYX_CACHE_DIR +rsync -Rv $pyx_file_list $PYX_CACHE_DIR + +echo "pyx files:" +echo $pyx_file_list + +tar cf ${CACHE_File} --files-from /dev/null + +for i in ${pyx_file_list} +do + f=${i%.pyx} + ls $f.{c,cpp} | tar rf ${CACHE_File} -T - +done + +echo "Cython files in cache tar:" +tar tvf ${CACHE_File} + +exit 0 diff --git a/ci/travis_encrypt_gbq.sh b/ci/travis_encrypt_gbq.sh new file mode 100755 index 0000000000000..719db67f384e0 --- /dev/null +++ b/ci/travis_encrypt_gbq.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +GBQ_JSON_FILE=$1 +GBQ_PROJECT_ID=$2 + +if [[ $# -ne 2 ]]; then + echo -e "Too few arguments.\nUsage: ./travis_encrypt_gbq.sh "\ + " " + exit 1 +fi + +if [[ $GBQ_JSON_FILE != *.json ]]; then + echo "ERROR: Expected *.json file" + exit 1 +fi + +if [[ ! -f $GBQ_JSON_FILE ]]; then + echo "ERROR: File $GBQ_JSON_FILE does not exist" + exit 1 +fi + +echo "Encrypting $GBQ_JSON_FILE..." +read -d "\n" TRAVIS_KEY TRAVIS_IV <<<$(travis encrypt-file $GBQ_JSON_FILE \ +travis_gbq.json.enc -f | grep -o "\w*_iv\|\w*_key"); + +echo "Adding your secure key and project id to travis_gbq_config.txt ..." +echo -e "TRAVIS_IV_ENV=$TRAVIS_IV\nTRAVIS_KEY_ENV=$TRAVIS_KEY\n"\ +"GBQ_PROJECT_ID='$GBQ_PROJECT_ID'" > travis_gbq_config.txt + +echo "Done. 
Removing file $GBQ_JSON_FILE" +rm $GBQ_JSON_FILE + +echo -e "Created encrypted credentials file travis_gbq.json.enc.\n"\ + "NOTE: Do NOT commit the *.json file containing your unencrypted" \ + "private key" diff --git a/ci/travis_gbq.json.enc b/ci/travis_gbq.json.enc new file mode 100644 index 0000000000000..c2a33bbd6f263 Binary files /dev/null and b/ci/travis_gbq.json.enc differ diff --git a/ci/travis_gbq_config.txt b/ci/travis_gbq_config.txt new file mode 100644 index 0000000000000..3b68d62f177cc --- /dev/null +++ b/ci/travis_gbq_config.txt @@ -0,0 +1,3 @@ +TRAVIS_IV_ENV=encrypted_1d9d7b1f171b_iv +TRAVIS_KEY_ENV=encrypted_1d9d7b1f171b_key +GBQ_PROJECT_ID='pandas-travis' diff --git a/ci/travis_process_gbq_encryption.sh b/ci/travis_process_gbq_encryption.sh new file mode 100755 index 0000000000000..7ff4c08f78e37 --- /dev/null +++ b/ci/travis_process_gbq_encryption.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +source ci/travis_gbq_config.txt + +if [[ -n ${!TRAVIS_IV_ENV} ]]; then + openssl aes-256-cbc -K ${!TRAVIS_KEY_ENV} -iv ${!TRAVIS_IV_ENV} \ + -in ci/travis_gbq.json.enc -out ci/travis_gbq.json -d; + export GBQ_PROJECT_ID=$GBQ_PROJECT_ID; + echo 'Successfully decrypted gbq credentials' +fi + diff --git a/doc/README.rst b/doc/README.rst index 06d95e6b9c44d..a93ad32a4c8f8 100644 --- a/doc/README.rst +++ b/doc/README.rst @@ -160,7 +160,7 @@ and `Good as first PR `_ where you could start out. -Or maybe you have an idea of you own, by using pandas, looking for something +Or maybe you have an idea of your own, by using pandas, looking for something in the documentation and thinking 'this can be improved', let's do something about that! diff --git a/doc/make.py b/doc/make.py index c09514d758833..d46be2611ce3d 100755 --- a/doc/make.py +++ b/doc/make.py @@ -18,13 +18,16 @@ """ from __future__ import print_function -import glob +import io +import glob # noqa import os import shutil import sys -import sphinx +from contextlib import contextmanager + +import sphinx # noqa import argparse -import jinja2 +import jinja2 # noqa os.environ['PYTHONPATH'] = '..' @@ -102,10 +105,98 @@ def clean(): shutil.rmtree('source/generated') +@contextmanager +def cleanup_nb(nb): + try: + yield + finally: + try: + os.remove(nb + '.executed') + except OSError: + pass + + +def get_kernel(): + """Find the kernel name for your python version""" + return 'python%s' % sys.version_info.major + + +def execute_nb(src, dst, allow_errors=False, timeout=1000, kernel_name=''): + """ + Execute notebook in `src` and write the output to `dst` + + Parameters + ---------- + src, dst: str + path to notebook + allow_errors: bool + timeout: int + kernel_name: str + defualts to value set in notebook metadata + + Returns + ------- + dst: str + """ + import nbformat + from nbconvert.preprocessors import ExecutePreprocessor + + with io.open(src, encoding='utf-8') as f: + nb = nbformat.read(f, as_version=4) + + ep = ExecutePreprocessor(allow_errors=allow_errors, + timeout=timeout, + kernel_name=kernel_name) + ep.preprocess(nb, resources={}) + + with io.open(dst, 'wt', encoding='utf-8') as f: + nbformat.write(nb, f) + return dst + + +def convert_nb(src, dst, to='html', template_file='basic'): + """ + Convert a notebook `src`. + + Parameters + ---------- + src, dst: str + filepaths + to: {'rst', 'html'} + format to export to + template_file: str + name of template file to use. 
Default 'basic' + """ + from nbconvert import HTMLExporter, RSTExporter + + dispatch = {'rst': RSTExporter, 'html': HTMLExporter} + exporter = dispatch[to.lower()](template_file=template_file) + + (body, resources) = exporter.from_filename(src) + with io.open(dst, 'wt', encoding='utf-8') as f: + f.write(body) + return dst + + def html(): check_build() - os.system('jupyter nbconvert --to=html --template=basic ' - '--output=source/html-styling.html source/html-styling.ipynb') + + notebooks = [ + 'source/html-styling.ipynb', + ] + + for nb in notebooks: + with cleanup_nb(nb): + try: + print("Converting %s" % nb) + kernel_name = get_kernel() + executed = execute_nb(nb, nb + '.executed', allow_errors=True, + kernel_name=kernel_name) + convert_nb(executed, nb.rstrip('.ipynb') + '.html') + except (ImportError, IndexError) as e: + print(e) + print("Failed to convert %s" % nb) + if os.system('sphinx-build -P -b html -d build/doctrees ' 'source build/html'): raise SystemExit("Building HTML failed.") @@ -116,6 +207,7 @@ def html(): except: pass + def zip_html(): try: print("\nZipping up HTML docs...") diff --git a/doc/source/_static/rplot-seaborn-example1.png b/doc/source/_static/rplot-seaborn-example1.png deleted file mode 100644 index d19a3a018bfbf..0000000000000 Binary files a/doc/source/_static/rplot-seaborn-example1.png and /dev/null differ diff --git a/doc/source/_static/rplot-seaborn-example2.png b/doc/source/_static/rplot-seaborn-example2.png deleted file mode 100644 index 9293082e78129..0000000000000 Binary files a/doc/source/_static/rplot-seaborn-example2.png and /dev/null differ diff --git a/doc/source/_static/rplot-seaborn-example3.png b/doc/source/_static/rplot-seaborn-example3.png deleted file mode 100644 index 8fd311acbd528..0000000000000 Binary files a/doc/source/_static/rplot-seaborn-example3.png and /dev/null differ diff --git a/doc/source/_static/rplot-seaborn-example3b.png b/doc/source/_static/rplot-seaborn-example3b.png deleted file mode 100644 index 4bfbac574ef29..0000000000000 Binary files a/doc/source/_static/rplot-seaborn-example3b.png and /dev/null differ diff --git a/doc/source/_static/rplot-seaborn-example4.png b/doc/source/_static/rplot-seaborn-example4.png deleted file mode 100644 index 8e08c7e86178a..0000000000000 Binary files a/doc/source/_static/rplot-seaborn-example4.png and /dev/null differ diff --git a/doc/source/_static/rplot-seaborn-example6.png b/doc/source/_static/rplot-seaborn-example6.png deleted file mode 100644 index 0fa56f4a018e7..0000000000000 Binary files a/doc/source/_static/rplot-seaborn-example6.png and /dev/null differ diff --git a/doc/source/advanced.rst index e50e792201d26..0c843dd39b56f 100644 --- a/doc/source/advanced.rst +++ b/doc/source/advanced.rst @@ -729,7 +729,7 @@ Int64Index and RangeIndex Prior to 0.18.0, the ``Int64Index`` would provide the default index for all ``NDFrame`` objects. ``RangeIndex`` is a sub-class of ``Int64Index`` added in version 0.18.0, now providing the default index for all ``NDFrame`` objects. -``RangeIndex`` is an optimized version of ``Int64Index`` that can represent a monotonic ordered set. These are analagous to python :ref:`range types `. +``RangeIndex`` is an optimized version of ``Int64Index`` that can represent a monotonic ordered set. These are analogous to python `range types `__. ..
_indexing.float64index: diff --git a/doc/source/api.rst b/doc/source/api.rst index 0e893308dd935..a510f663d19ee 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -151,6 +151,8 @@ Data manipulations cut qcut merge + merge_ordered + merge_asof concat get_dummies factorize @@ -378,6 +380,7 @@ Reindexing / Selection / Label manipulation Series.reindex Series.reindex_like Series.rename + Series.rename_axis Series.reset_index Series.sample Series.select @@ -469,6 +472,7 @@ These can be accessed like ``Series.dt.``. Series.dt.is_quarter_end Series.dt.is_year_start Series.dt.is_year_end + Series.dt.is_leap_year Series.dt.daysinmonth Series.dt.days_in_month Series.dt.tz @@ -887,6 +891,7 @@ Reindexing / Selection / Label manipulation DataFrame.reindex_axis DataFrame.reindex_like DataFrame.rename + DataFrame.rename_axis DataFrame.reset_index DataFrame.sample DataFrame.select @@ -943,6 +948,7 @@ Time series-related :toctree: generated/ DataFrame.asfreq + DataFrame.asof DataFrame.shift DataFrame.first_valid_index DataFrame.last_valid_index @@ -1343,6 +1349,8 @@ Modifying and Computations Index.unique Index.nunique Index.value_counts + Index.fillna + Index.dropna Conversion ~~~~~~~~~~ @@ -1492,6 +1500,7 @@ Time/Date Components DatetimeIndex.is_quarter_end DatetimeIndex.is_year_start DatetimeIndex.is_year_end + DatetimeIndex.is_leap_year DatetimeIndex.inferred_freq Selecting diff --git a/doc/source/basics.rst b/doc/source/basics.rst index 917d2f2bb8b04..1f670fb7fb593 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -1159,14 +1159,17 @@ mapping (a dict or Series) or an arbitrary function. s.rename(str.upper) If you pass a function, it must return a value when called with any of the -labels (and must produce a set of unique values). But if you pass a dict or -Series, it need only contain a subset of the labels as keys: +labels (and must produce a set of unique values). A dict or +Series can also be used: .. ipython:: python df.rename(columns={'one' : 'foo', 'two' : 'bar'}, index={'a' : 'apple', 'b' : 'banana', 'd' : 'durian'}) +If the mapping doesn't include a column/index label, it isn't renamed. Also +extra labels in the mapping don't throw an error. + The :meth:`~DataFrame.rename` method also provides an ``inplace`` named parameter that is by default ``False`` and copies the underlying data. Pass ``inplace=True`` to rename the data in place. @@ -1748,42 +1751,98 @@ Convert a subset of columns to a specified type using :meth:`~DataFrame.astype` dft.loc[:, ['a', 'b']] = dft.loc[:, ['a', 'b']].astype(np.uint8) dft.dtypes +.. _basics.object_conversion: + object conversion ~~~~~~~~~~~~~~~~~ -:meth:`~DataFrame.convert_objects` is a method to try to force conversion of types from the ``object`` dtype to other types. -To force conversion of specific types that are *number like*, e.g. could be a string that represents a number, -pass ``convert_numeric=True``. This will force strings and numbers alike to be numbers if possible, otherwise -they will be set to ``np.nan``. +pandas offers various functions to try to force conversion of types from the ``object`` dtype to other types. +The following functions are available for one dimensional object arrays or scalars: + +- :meth:`~pandas.to_numeric` (conversion to numeric dtypes) + + .. ipython:: python + + m = ['1.1', 2, 3] + pd.to_numeric(m) + +- :meth:`~pandas.to_datetime` (conversion to datetime objects) + + .. 
ipython:: python + + import datetime + m = ['2016-07-09', datetime.datetime(2016, 3, 2)] + pd.to_datetime(m) + +- :meth:`~pandas.to_timedelta` (conversion to timedelta objects) + + .. ipython:: python + + m = ['5us', pd.Timedelta('1day')] + pd.to_timedelta(m) + +To force a conversion, we can pass in an ``errors`` argument, which specifies how pandas should deal with elements +that cannot be converted to desired dtype or object. By default, ``errors='raise'``, meaning that any errors encountered +will be raised during the conversion process. However, if ``errors='coerce'``, these errors will be ignored and pandas +will convert problematic elements to ``pd.NaT`` (for datetime and timedelta) or ``np.nan`` (for numeric). This might be +useful if you are reading in data which is mostly of the desired dtype (e.g. numeric, datetime), but occasionally has +non-conforming elements intermixed that you want to represent as missing: .. ipython:: python - :okwarning: - df3['D'] = '1.' - df3['E'] = '1' - df3.convert_objects(convert_numeric=True).dtypes + import datetime + m = ['apple', datetime.datetime(2016, 3, 2)] + pd.to_datetime(m, errors='coerce') - # same, but specific dtype conversion - df3['D'] = df3['D'].astype('float16') - df3['E'] = df3['E'].astype('int32') - df3.dtypes + m = ['apple', 2, 3] + pd.to_numeric(m, errors='coerce') -To force conversion to ``datetime64[ns]``, pass ``convert_dates='coerce'``. -This will convert any datetime-like object to dates, forcing other values to ``NaT``. -This might be useful if you are reading in data which is mostly dates, -but occasionally has non-dates intermixed and you want to represent as missing. + m = ['apple', pd.Timedelta('1day')] + pd.to_timedelta(m, errors='coerce') + +The ``errors`` parameter has a third option of ``errors='ignore'``, which will simply return the passed in data if it +encounters any errors with the conversion to a desired data type: .. ipython:: python - import datetime - s = pd.Series([datetime.datetime(2001,1,1,0,0), - 'foo', 1.0, 1, pd.Timestamp('20010104'), - '20010105'], dtype='O') - s - pd.to_datetime(s, errors='coerce') + import datetime + m = ['apple', datetime.datetime(2016, 3, 2)] + pd.to_datetime(m, errors='ignore') + + m = ['apple', 2, 3] + pd.to_numeric(m, errors='ignore') + + m = ['apple', pd.Timedelta('1day')] + pd.to_timedelta(m, errors='ignore') + +In addition to object conversion, :meth:`~pandas.to_numeric` provides another argument ``downcast``, which gives the +option of downcasting the newly (or already) numeric data to a smaller dtype, which can conserve memory: + +.. ipython:: python + + m = ['1', 2, 3] + pd.to_numeric(m, downcast='integer') # smallest signed int dtype + pd.to_numeric(m, downcast='signed') # same as 'integer' + pd.to_numeric(m, downcast='unsigned') # smallest unsigned int dtype + pd.to_numeric(m, downcast='float') # smallest float dtype -In addition, :meth:`~DataFrame.convert_objects` will attempt the *soft* conversion of any *object* dtypes, meaning that if all -the objects in a Series are of the same type, the Series will have that dtype. +As these methods apply only to one-dimensional arrays, lists or scalars; they cannot be used directly on multi-dimensional objects such +as DataFrames. However, with :meth:`~pandas.DataFrame.apply`, we can "apply" the function over each column efficiently: + +.. 
ipython:: python + + import datetime + df = pd.DataFrame([['2016-07-09', datetime.datetime(2016, 3, 2)]] * 2, dtype='O') + df + df.apply(pd.to_datetime) + + df = pd.DataFrame([['1.1', 2, 3]] * 2, dtype='O') + df + df.apply(pd.to_numeric) + + df = pd.DataFrame([['5us', pd.Timedelta('1day')]] * 2, dtype='O') + df + df.apply(pd.to_timedelta) gotchas ~~~~~~~ diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index b518bc947c2da..59ddfe602c033 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -648,6 +648,88 @@ In this case the categories are not the same and so an error is raised: The same applies to ``df.append(df_different)``. +.. _categorical.union: + +Unioning +~~~~~~~~ + +.. versionadded:: 0.19.0 + +If you want to combine categoricals that do not necessarily have +the same categories, the ``union_categoricals`` function will +combine a list-like of categoricals. The new categories +will be the union of the categories being combined. + +.. ipython:: python + + from pandas.types.concat import union_categoricals + a = pd.Categorical(["b", "c"]) + b = pd.Categorical(["a", "b"]) + union_categoricals([a, b]) + +By default, the resulting categories will be ordered as +they appear in the data. If you want the categories to +be lexsorted, use ``sort_categories=True`` argument. + +.. ipython:: python + + union_categoricals([a, b], sort_categories=True) + +``union_categoricals`` also works with the "easy" case of combining two +categoricals of the same categories and order information +(e.g. what you could also ``append`` for). + +.. ipython:: python + + a = pd.Categorical(["a", "b"], ordered=True) + b = pd.Categorical(["a", "b", "a"], ordered=True) + union_categoricals([a, b]) + +The below raises ``TypeError`` because the categories are ordered and not identical. + +.. code-block:: ipython + + In [1]: a = pd.Categorical(["a", "b"], ordered=True) + In [2]: b = pd.Categorical(["a", "b", "c"], ordered=True) + In [3]: union_categoricals([a, b]) + Out[3]: + TypeError: to union ordered Categoricals, all categories must be the same + +.. _categorical.concat: + +Concatenation +~~~~~~~~~~~~~ + +This section describes concatenations specific to ``category`` dtype. See :ref:`Concatenating objects` for general description. + +By default, ``Series`` or ``DataFrame`` concatenation which contains the same categories +results in ``category`` dtype, otherwise results in ``object`` dtype. +Use ``.astype`` or ``union_categoricals`` to get ``category`` result. + +.. ipython:: python + + # same categories + s1 = pd.Series(['a', 'b'], dtype='category') + s2 = pd.Series(['a', 'b', 'a'], dtype='category') + pd.concat([s1, s2]) + + # different categories + s3 = pd.Series(['b', 'c'], dtype='category') + pd.concat([s1, s3]) + + pd.concat([s1, s3]).astype('category') + union_categoricals([s1.values, s3.values]) + + +Following table summarizes the results of ``Categoricals`` related concatenations. 
+ +| arg1 | arg2 | result | +|----------|---------------------------------------------------------|----------------------------| +| category | category (identical categories) | category | +| category | category (different categories, both not ordered) | object (dtype is inferred) | +| category | category (different categories, either one is ordered) | object (dtype is inferred) | +| category | not category | object (dtype is inferred) | + Getting Data In/Out ------------------- diff --git a/doc/source/computation.rst index 59675e33e724b..1414d2dd3c8dc 100644 --- a/doc/source/computation.rst +++ b/doc/source/computation.rst @@ -391,6 +391,89 @@ For some windowing functions, additional parameters must be specified: such that the weights are normalized with respect to each other. Weights of ``[1, 1, 1]`` and ``[2, 2, 2]`` yield the same result. +.. _stats.moments.ts: + +Time-aware Rolling +~~~~~~~~~~~~~~~~~~ + +.. versionadded:: 0.19.0 + +New in version 0.19.0 is the ability to pass an offset (or convertible) to a ``.rolling()`` method and have it produce +variable sized windows based on the passed time window. For each time point, this includes all preceding values occurring +within the indicated time delta. + +This can be particularly useful for a non-regular time frequency index. + +.. ipython:: python + + dft = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}, + index=pd.date_range('20130101 09:00:00', periods=5, freq='s')) + dft + +This is a regular frequency index. Using an integer window parameter works to roll along the window frequency. + +.. ipython:: python + + dft.rolling(2).sum() + dft.rolling(2, min_periods=1).sum() + +Specifying an offset allows a more intuitive specification of the rolling frequency. + +.. ipython:: python + + dft.rolling('2s').sum() + +Using a non-regular, but still monotonic index, rolling with an integer window does not impart any special calculation. + + +.. ipython:: python + + dft = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}, + index = pd.Index([pd.Timestamp('20130101 09:00:00'), + pd.Timestamp('20130101 09:00:02'), + pd.Timestamp('20130101 09:00:03'), + pd.Timestamp('20130101 09:00:05'), + pd.Timestamp('20130101 09:00:06')], + name='foo')) + dft + dft.rolling(2).sum() + + +Using the time-specification generates variable windows for this sparse data. + +.. ipython:: python + + dft.rolling('2s').sum() + +Furthermore, we now allow an optional ``on`` parameter to specify a column (rather than the +default of the index) in a DataFrame. + +.. ipython:: python + + dft = dft.reset_index() + dft + dft.rolling('2s', on='foo').sum() + +.. _stats.moments.ts-versus-resampling: + +Time-aware Rolling vs. Resampling +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Using ``.rolling()`` with a time-based index is quite similar to :ref:`resampling `. They +both operate and perform reductive operations on time-indexed pandas objects. + +When using ``.rolling()`` with an offset, the offset is a time-delta. Pandas takes a backwards-in-time looking window and +aggregates all of the values in that window (including the end-point, but not the start-point); this becomes the new value +at that point in the result. These are variable sized windows in time-space for each point of the input, so you will get +a result the same size as the input. + +When using ``.resample()`` with an offset, pandas constructs a new index at the frequency of the offset. For each frequency +bin, it aggregates the points from the input that fall within a backwards-in-time looking window ending at that bin. The result of this +aggregation is the output for that frequency point. The windows are fixed size in the frequency space. Your result +will have the shape of a regular frequency between the min and the max of the original input object.
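To make the contrast concrete, here is a minimal illustrative sketch (an editor's addition, not part of the patch), assuming pandas 0.19+ and the same irregular five-point index used in the examples above:

.. code-block:: python

    import numpy as np
    import pandas as pd

    dft = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]},
                       index=pd.DatetimeIndex(['2013-01-01 09:00:00',
                                               '2013-01-01 09:00:02',
                                               '2013-01-01 09:00:03',
                                               '2013-01-01 09:00:05',
                                               '2013-01-01 09:00:06']))

    # .rolling('2s'): one output row per input row, each summing the
    # values observed in the 2 seconds ending at that row's timestamp
    dft.rolling('2s').sum()

    # .resample('2s'): one output row per regular 2-second bin spanning
    # the index, each summing the input rows that fall into that bin
    dft.resample('2s').sum()

Both calls reduce with ``sum``, but the rolling result keeps the original irregular index, while the resampled result has a new, regular 2-second index.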
+ +To summarize, ``.rolling()`` is a time-based window operation, while ``.resample()`` is a frequency-based window operation. + Centering Windows ~~~~~~~~~~~~~~~~~ diff --git a/doc/source/conf.py index 87510d13ee484..a1b71f0279c7a 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -288,11 +288,11 @@ # Example configuration for intersphinx: refer to the Python standard library. intersphinx_mapping = { - 'statsmodels': ('http://statsmodels.sourceforge.net/devel/', None), + 'statsmodels': ('http://www.statsmodels.org/devel/', None), 'matplotlib': ('http://matplotlib.org/', None), 'python': ('http://docs.python.org/3', None), 'numpy': ('http://docs.scipy.org/doc/numpy', None), - 'scipy': ('http://docs.scipy.org/doc/scipy', None), + 'scipy': ('http://docs.scipy.org/doc/scipy/reference', None), 'py': ('http://pylib.readthedocs.org/en/latest/', None) } import glob @@ -318,6 +318,7 @@ # Add custom Documenter to handle attributes/methods of an AccessorProperty # eg pandas.Series.str and pandas.Series.dt (see GH9322) +import sphinx from sphinx.util import rpartition from sphinx.ext.autodoc import Documenter, MethodDocumenter, AttributeDocumenter from sphinx.ext.autosummary import Autosummary @@ -365,7 +366,10 @@ def resolve_name(self, modname, parents, path, base): if not modname: modname = self.env.temp_data.get('autodoc:module') if not modname: - modname = self.env.temp_data.get('py:module') + if sphinx.__version__ > '1.3': + modname = self.env.ref_context.get('py:module') + else: + modname = self.env.temp_data.get('py:module') # ... else, it stays None, which means invalid return modname, parents + [base] diff --git a/doc/source/contributing.rst index e64ff4c155132..7f336abcaa6d7 100644 --- a/doc/source/contributing.rst +++ b/doc/source/contributing.rst @@ -21,7 +21,7 @@ and `Difficulty Novice `_ where you could start out. -Or maybe through using *pandas* you have an idea of you own or are looking for something +Or maybe through using *pandas* you have an idea of your own or are looking for something in the documentation and thinking 'this can be improved'...you can do something about it! @@ -188,7 +188,7 @@ To work in this environment, Windows users should ``activate`` it as follows:: activate pandas_dev -Mac OSX and Linux users should use:: +Mac OSX / Linux users should use:: source activate pandas_dev @@ -198,10 +198,14 @@ To view your environments:: conda info -e -To return to you home root environment:: +To return to your home root environment in Windows:: deactivate +To return to your home root environment in OSX / Linux:: + + source deactivate + See the full conda docs `here `__. At this point you can easily do an *in-place* install, as detailed in the next section. @@ -347,19 +351,24 @@ How to build the *pandas* documentation Requirements ~~~~~~~~~~~~ -To build the *pandas* docs there are some extra requirements: you will need to +First, you need to have a development environment to be able to build pandas +(see the docs on :ref:`creating a development environment above `). +Further, to build the docs, there are some extra requirements: you will need to +have ``sphinx`` and ``ipython`` installed.
`numpydoc `_ is used to parse the docstrings that follow the Numpy Docstring Standard (see above), but you don't need to install this because a local copy of numpydoc is included in the *pandas* source code. +`nbconvert `_ and +`nbformat `_ are required to build +the Jupyter notebooks included in the documentation. -It is easiest to :ref:`create a development environment `, then install:: +If you have a conda environment named ``pandas_dev``, you can install the extra +requirements with:: - conda install -n pandas_dev sphinx ipython + conda install -n pandas_dev sphinx ipython nbconvert nbformat -Furthermore, it is recommended to have all `optional dependencies -`_ +Furthermore, it is recommended to have all :ref:`optional dependencies `. installed. This is not strictly necessary, but be aware that you will see some error messages when building the docs. This happens because all the code in the documentation is executed during the doc build, and so code examples using optional dependencies @@ -368,8 +377,7 @@ version of all dependencies. .. warning:: - You need to have ``sphinx`` version 1.2.2 or newer, but older than version 1.3. - Versions before 1.1.3 should also work. + You need to have ``sphinx`` version >= 1.3.2. Building the documentation ~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -567,42 +575,35 @@ To install asv:: pip install git+https://github.com/spacetelescope/asv -If you need to run a benchmark, change your directory to ``/asv_bench/`` and run -the following if you have been developing on ``master``:: +If you need to run a benchmark, change your directory to ``asv_bench/`` and run:: - asv continuous master + asv continuous -f 1.1 upstream/master HEAD -This command uses ``conda`` by default for creating the benchmark +You can replace ``HEAD`` with the name of the branch you are working on, +and report benchmarks that changed by more than 10%. +The command uses ``conda`` by default for creating the benchmark environments. If you want to use virtualenv instead, write:: - asv continuous -E virtualenv master + asv continuous -f 1.1 -E virtualenv upstream/master HEAD The ``-E virtualenv`` option should be added to all ``asv`` commands that run benchmarks. The default value is defined in ``asv.conf.json``. -If you are working on another branch, either of the following can be used:: - - asv continuous master HEAD - asv continuous master your_branch - -This will check out the master revision and run the suite on both master and -your commit. Running the full test suite can take up to one hour and use up -to 3GB of RAM. Usually it is sufficient to paste only a subset of the results into -the pull request to show that the committed changes do not cause unexpected -performance regressions. - -You can run specific benchmarks using the ``-b`` flag, which takes a regular expression. -For example, this will only run tests from a ``pandas/asv_bench/benchmarks/groupby.py`` -file:: +Running the full test suite can take up to one hour and use up to 3GB of RAM. +Usually it is sufficient to paste only a subset of the results into the pull +request to show that the committed changes do not cause unexpected performance +regressions. You can run specific benchmarks using the ``-b`` flag, which +takes a regular expression. 
For example, this will only run tests from a +``pandas/asv_bench/benchmarks/groupby.py`` file:: - asv continuous master -b groupby + asv continuous -f 1.1 upstream/master HEAD -b ^groupby If you want to only run a specific group of tests from a file, you can do it using ``.`` as a separator. For example:: - asv continuous master -b groupby.groupby_agg_builtins1 + asv continuous -f 1.1 upstream/master HEAD -b groupby.groupby_agg_builtins -will only run a ``groupby_agg_builtins1`` test defined in a ``groupby`` file. +will only run the ``groupby_agg_builtins`` benchmark defined in ``groupby.py``. You can also run the benchmark suite using the version of ``pandas`` already installed in your current Python environment. This can be @@ -625,6 +626,44 @@ This will display stderr from the benchmarks, and use your local Information on how to write a benchmark and how to use asv can be found in the `asv documentation `_. +.. _contributing.gbq_integration_tests: + +Running Google BigQuery Integration Tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You will need to create a Google BigQuery private key in JSON format in +order to run Google BigQuery integration tests on your local machine and +on Travis-CI. The first step is to create a `service account +`__. + +Integration tests for ``pandas.io.gbq`` are skipped in pull requests because +the credentials that are required for running Google BigQuery integration +tests are `encrypted `__ +on Travis-CI and are only accessible from the pydata/pandas repository. The +credentials won't be available on forks of pandas. Here are the steps to run +gbq integration tests on a forked repository: + +#. First, complete all the steps in the `Encrypting Files Prerequisites + `__ section. +#. Sign into `Travis `__ using your GitHub account. +#. Enable your forked repository of pandas for testing in `Travis + `__. +#. Run the following command from terminal where the current working directory + is the ``ci`` folder:: + + ./travis_encrypt_gbq.sh + +#. Create a new branch from the branch used in your pull request. Commit the + encrypted file called ``travis_gbq.json.enc`` as well as the file + ``travis_gbq_config.txt``, in an otherwise empty commit. DO NOT commit the + ``*.json`` file which contains your unencrypted private key. +#. Your branch should be tested automatically once it is pushed. You can check + the status by visiting your Travis branches page which exists at the + following location: https://travis-ci.org/your-user-name/pandas/branches . + Click on a build job for your branch. Expand the following line in the + build log: ``ci/print_skipped.py /tmp/nosetests.xml`` . Search for the + term ``test_gbq`` and confirm that gbq integration tests are not skipped. + Running the vbench performance test suite (phasing out) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -813,6 +852,11 @@ updated. Pushing them to GitHub again is done by:: This will automatically update your pull request with the latest code and restart the Travis-CI tests. +If your pull request is related to the ``pandas.io.gbq`` module, please see +the section on :ref:`Running Google BigQuery Integration Tests +` to configure a Google BigQuery service +account for your pull request on Travis-CI. + Delete your merged branch (optional) ------------------------------------ diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst index 0dbc79415af0b..38a816060e1bc 100644 --- a/doc/source/cookbook.rst +++ b/doc/source/cookbook.rst @@ -679,6 +679,19 @@ The :ref:`Pivot ` docs. 
'Employed' : lambda x : sum(x), 'Grade' : lambda x : sum(x) / len(x)}) +`Plot pandas DataFrame with year over year data +`__ + +To create year and month crosstabulation: + +.. ipython:: python + + df = pd.DataFrame({'value': np.random.randn(36)}, + index=pd.date_range('2011-01-01', freq='M', periods=36)) + + pd.pivot_table(df, index=df.index.month, columns=df.index.year, + values='value', aggfunc='sum') + Apply ***** diff --git a/doc/source/dsintro.rst b/doc/source/dsintro.rst index bbb43396a85b9..6063e3e8bce45 100644 --- a/doc/source/dsintro.rst +++ b/doc/source/dsintro.rst @@ -935,125 +935,20 @@ method: minor_axis=['a', 'b', 'c', 'd']) panel.to_frame() - -.. _dsintro.panel4d: - -Panel4D (Experimental) ----------------------- - -``Panel4D`` is a 4-Dimensional named container very much like a ``Panel``, but -having 4 named dimensions. It is intended as a test bed for more N-Dimensional named -containers. - - - **labels**: axis 0, each item corresponds to a Panel contained inside - - **items**: axis 1, each item corresponds to a DataFrame contained inside - - **major_axis**: axis 2, it is the **index** (rows) of each of the - DataFrames - - **minor_axis**: axis 3, it is the **columns** of each of the DataFrames - -``Panel4D`` is a sub-class of ``Panel``, so most methods that work on Panels are -applicable to Panel4D. The following methods are disabled: - - - ``join , to_frame , to_excel , to_sparse , groupby`` - -Construction of Panel4D works in a very similar manner to a ``Panel`` - -From 4D ndarray with optional axis labels -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. ipython:: python - - p4d = pd.Panel4D(np.random.randn(2, 2, 5, 4), - labels=['Label1','Label2'], - items=['Item1', 'Item2'], - major_axis=pd.date_range('1/1/2000', periods=5), - minor_axis=['A', 'B', 'C', 'D']) - p4d - - -From dict of Panel objects -~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. ipython:: python - - data = { 'Label1' : pd.Panel({ 'Item1' : pd.DataFrame(np.random.randn(4, 3)) }), - 'Label2' : pd.Panel({ 'Item2' : pd.DataFrame(np.random.randn(4, 2)) }) } - pd.Panel4D(data) - -Note that the values in the dict need only be **convertible to Panels**. -Thus, they can be any of the other valid inputs to Panel as per above. - -Slicing -~~~~~~~ - -Slicing works in a similar manner to a Panel. ``[]`` slices the first dimension. -``.ix`` allows you to slice arbitrarily and get back lower dimensional objects - -.. ipython:: python - - p4d['Label1'] - -4D -> Panel - -.. ipython:: python - - p4d.ix[:,:,:,'A'] - -4D -> DataFrame - -.. ipython:: python - - p4d.ix[:,:,0,'A'] - -4D -> Series - -.. ipython:: python - - p4d.ix[:,0,0,'A'] - -Transposing -~~~~~~~~~~~ - -A Panel4D can be rearranged using its ``transpose`` method (which does not make a -copy by default unless the data are heterogeneous): - -.. ipython:: python - - p4d.transpose(3, 2, 1, 0) - .. _dsintro.panelnd: +.. _dsintro.panel4d: -PanelND (Experimental) ----------------------- - -PanelND is a module with a set of factory functions to enable a user to construct N-dimensional named -containers like Panel4D, with a custom set of axis labels. Thus a domain-specific container can easily be -created. - -The following creates a Panel5D. A new panel type object must be sliceable into a lower dimensional object. -Here we slice to a Panel4D. - -.. 
ipython:: python - - from pandas.core import panelnd - Panel5D = panelnd.create_nd_panel_factory( - klass_name = 'Panel5D', - orders = [ 'cool', 'labels','items','major_axis','minor_axis'], - slices = { 'labels' : 'labels', 'items' : 'items', - 'major_axis' : 'major_axis', 'minor_axis' : 'minor_axis' }, - slicer = pd.Panel4D, - aliases = { 'major' : 'major_axis', 'minor' : 'minor_axis' }, - stat_axis = 2) - - p5d = Panel5D(dict(C1 = p4d)) - p5d +Panel4D and PanelND (Deprecated) +-------------------------------- - # print a slice of our 5D - p5d.ix['C1',:,:,0:3,:] +.. warning:: - # transpose it - p5d.transpose(1,2,3,4,0) + In 0.19.0 ``Panel4D`` and ``PanelND`` are deprecated and will be removed in + a future version. The recommended way to represent these types of + n-dimensional data are with the + `xarray package `__. + Pandas provides a :meth:`~Panel4D.to_xarray` method to automate + this conversion. - # look at the shape & dim - p5d.shape - p5d.ndim +See the `docs of a previous version `__ +for documentation on these objects. diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index 0d7315d20eac3..17ebd1f163f4f 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -24,8 +24,8 @@ substantial projects that you feel should be on this list, please let us know. Statistics and Machine Learning ------------------------------- -`Statsmodels `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +`Statsmodels `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Statsmodels is the prominent python "statistics and econometrics library" and it has a long-standing special relationship with pandas. Statsmodels provides powerful statistics, @@ -77,14 +77,28 @@ more advanced types of plots then those offered by pandas. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The `Vincent `__ project leverages `Vega `__ -(that in turn, leverages `d3 `__) to create plots . It has great support -for pandas data objects. +(that in turn, leverages `d3 `__) to create +plots. Although functional, as of Summer 2016 the Vincent project has not been updated +in over two years and is `unlikely to receive further updates `__. + +`IPython Vega `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Like Vincent, the `IPython Vega `__ project leverages `Vega +`__ to create plots, but primarily +targets the IPython Notebook environment. `Plotly `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ `Plotly’s `__ `Python API `__ enables interactive figures and web shareability. Maps, 2D, 3D, and live-streaming graphs are rendered with WebGL and `D3.js `__. The library supports plotting directly from a pandas DataFrame and cloud-based collaboration. Users of `matplotlib, ggplot for Python, and Seaborn `__ can convert figures into interactive web-based plots. Plots can be drawn in `IPython Notebooks `__ , edited with R or MATLAB, modified in a GUI, or embedded in apps and dashboards. Plotly is free for unlimited sharing, and has `cloud `__, `offline `__, or `on-premise `__ accounts for private use. +`Pandas-Qt `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Spun off from the main pandas library, the `Pandas-Qt `__ +library enables DataFrame visualization and manipulation in PyQt4 and PySide applications. + .. _ecosystem.ide: IDE @@ -115,8 +129,8 @@ compatible with non-HTML IPython output formats.) qgrid is "an interactive grid for sorting and filtering DataFrames in IPython Notebook" built with SlickGrid. 
-`Spyder `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +`Spyder `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Spyder is a cross-platform Qt-based open-source Python IDE with editing, testing, debugging, and introspection features. diff --git a/doc/source/faq.rst b/doc/source/faq.rst index e5d659cc31606..d23e0ca59254d 100644 --- a/doc/source/faq.rst +++ b/doc/source/faq.rst @@ -110,78 +110,6 @@ details. Visualizing Data in Qt applications ----------------------------------- -.. warning:: - - The ``qt`` support is **deprecated and will be removed in a future version**. - We refer users to the external package `pandas-qt `_. - -There is experimental support for visualizing DataFrames in PyQt4 and PySide -applications. At the moment you can display and edit the values of the cells -in the DataFrame. Qt will take care of displaying just the portion of the -DataFrame that is currently visible and the edits will be immediately saved to -the underlying DataFrame - -To demonstrate this we will create a simple PySide application that will switch -between two editable DataFrames. For this will use the ``DataFrameModel`` class -that handles the access to the DataFrame, and the ``DataFrameWidget``, which is -just a thin layer around the ``QTableView``. - -.. code-block:: python - - import numpy as np - import pandas as pd - from pandas.sandbox.qtpandas import DataFrameModel, DataFrameWidget - from PySide import QtGui, QtCore - - # Or if you use PyQt4: - # from PyQt4 import QtGui, QtCore - - class MainWidget(QtGui.QWidget): - def __init__(self, parent=None): - super(MainWidget, self).__init__(parent) - - # Create two DataFrames - self.df1 = pd.DataFrame(np.arange(9).reshape(3, 3), - columns=['foo', 'bar', 'baz']) - self.df2 = pd.DataFrame({ - 'int': [1, 2, 3], - 'float': [1.5, 2.5, 3.5], - 'string': ['a', 'b', 'c'], - 'nan': [np.nan, np.nan, np.nan] - }, index=['AAA', 'BBB', 'CCC'], - columns=['int', 'float', 'string', 'nan']) - - # Create the widget and set the first DataFrame - self.widget = DataFrameWidget(self.df1) - - # Create the buttons for changing DataFrames - self.button_first = QtGui.QPushButton('First') - self.button_first.clicked.connect(self.on_first_click) - self.button_second = QtGui.QPushButton('Second') - self.button_second.clicked.connect(self.on_second_click) - - # Set the layout - vbox = QtGui.QVBoxLayout() - vbox.addWidget(self.widget) - hbox = QtGui.QHBoxLayout() - hbox.addWidget(self.button_first) - hbox.addWidget(self.button_second) - vbox.addLayout(hbox) - self.setLayout(vbox) - - def on_first_click(self): - '''Sets the first DataFrame''' - self.widget.setDataFrame(self.df1) - - def on_second_click(self): - '''Sets the second DataFrame''' - self.widget.setDataFrame(self.df2) - - if __name__ == '__main__': - import sys - - # Initialize the application - app = QtGui.QApplication(sys.argv) - mw = MainWidget() - mw.show() - app.exec_() +There is no support for such visualization in pandas. However, the external +package `pandas-qt `_ does +provide this functionality. diff --git a/doc/source/gotchas.rst b/doc/source/gotchas.rst index c79b902d559d5..99d7486cde2d0 100644 --- a/doc/source/gotchas.rst +++ b/doc/source/gotchas.rst @@ -173,7 +173,7 @@ dtype in order to store the NAs. These are summarized by this table: ``integer``, cast to ``float64`` ``boolean``, cast to ``object`` -While this may seem like a heavy trade-off, in practice I have found very few +While this may seem like a heavy trade-off, I have found very few cases where this is an issue in practice. 
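As a concrete illustration of the casting summarized in that table, here is a small editor's sketch (not part of the patch; the exact dtypes assume a pandas version with this promotion behaviour):

.. code-block:: python

    import pandas as pd

    s = pd.Series([1, 2, 3])
    s.dtype                        # int64

    # reindexing introduces a missing value, so integers are cast to float64
    s.reindex([0, 1, 2, 3]).dtype

    b = pd.Series([True, False, True])
    # a missing value forces booleans to an object dtype
    b.reindex([0, 1, 2, 3]).dtype
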
Some explanation for the motivation here in the next section. diff --git a/doc/source/groupby.rst index 02309fe5d6509..c5a77770085d6 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -13,6 +13,7 @@ matplotlib.style.use('ggplot') import matplotlib.pyplot as plt plt.close('all') + from collections import OrderedDict ***************************** Group By: split-apply-combine @@ -487,6 +488,17 @@ must be either implemented on GroupBy or available via :ref:`dispatching grouped.agg({'C' : 'sum', 'D' : 'std'}) +.. note:: + + If you pass a dict to ``aggregate``, the ordering of the output columns is + non-deterministic. If you want to be sure the output columns will be in a specific + order, you can use an ``OrderedDict``. Compare the output of the following two commands: + +.. ipython:: python + + grouped.agg({'D': 'std', 'C': 'mean'}) + grouped.agg(OrderedDict([('D', 'std'), ('C', 'mean')])) + .. _groupby.aggregate.cython: Cython-optimized aggregation functions @@ -1015,6 +1027,23 @@ Regroup columns of a DataFrame according to their sum, and sum the aggregated on df df.groupby(df.sum(), axis=1).sum() +Groupby by Indexer to 'resample' data +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Resampling produces new hypothetical samples (resamples) from already existing observed data or from a model that generates data. These new samples are similar to the pre-existing samples. + +In order for resampling to work on indices that are not datetime-like, the following procedure can be used. + +In the following examples, **df.index // 5** returns an integer array which is used to determine which rows get selected for the groupby operation. + +.. note:: The example below shows how we can downsample by consolidating samples into fewer ones. Here, by using **df.index // 5**, we aggregate the samples into bins. By applying the **std()** function, we condense the information contained in many samples into a small subset of values (their standard deviation), thereby reducing the number of samples. + +.. ipython:: python + + df = pd.DataFrame(np.random.randn(10,2)) + df + df.index // 5 + df.groupby(df.index // 5).std() Returning a Series to propagate names ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/html-styling.ipynb index 77813a03c704a..e55712b2bb4f6 100644 --- a/doc/source/html-styling.ipynb +++ b/doc/source/html-styling.ipynb @@ -47,14 +47,16 @@ "`Styler.apply` passes each column or row of your DataFrame into your function one-at-a-time, or the entire table at once, depending on the `axis` keyword argument.\n", "For columnwise use `axis=0`, rowwise use `axis=1`, and for the entire table at once use `axis=None`.\n", "\n", - "The result of the function application, a CSS attribute-value pair, is stored in an internal dictionary on your ``Styler`` object.\n", + "For `Styler.applymap` your function should take a scalar and return a single string with the CSS attribute-value pair.\n", + "\n", + "For `Styler.apply` your function should take a Series or DataFrame (depending on the axis parameter), and return a Series or DataFrame with an identical shape where each value is a string with a CSS attribute-value pair.\n", + "\n", + "Let's see some examples."
] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { "collapsed": false }, @@ -79,293 +81,11 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " A\n", - " \n", - " B\n", - " \n", - " C\n", - " \n", - " D\n", - " \n", - " E\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " 1.32921\n", - " \n", - " \n", - " nan\n", - " \n", - " \n", - " -0.31628\n", - " \n", - " \n", - " -0.99081\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " 2\n", - " \n", - " \n", - " -1.07082\n", - " \n", - " \n", - " -1.43871\n", - " \n", - " \n", - " 0.564417\n", - " \n", - " \n", - " 0.295722\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " 3\n", - " \n", - " \n", - " -1.6264\n", - " \n", - " \n", - " 0.219565\n", - " \n", - " \n", - " 0.678805\n", - " \n", - " \n", - " 1.88927\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " 4\n", - " \n", - " \n", - " 0.961538\n", - " \n", - " \n", - " 0.104011\n", - " \n", - " \n", - " -0.481165\n", - " \n", - " \n", - " 0.850229\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " 5\n", - " \n", - " \n", - " 1.45342\n", - " \n", - " \n", - " 1.05774\n", - " \n", - " \n", - " 0.165562\n", - " \n", - " \n", - " 0.515018\n", - " \n", - "
\n", - " 5\n", - " \n", - " \n", - " 6\n", - " \n", - " \n", - " -1.33694\n", - " \n", - " \n", - " 0.562861\n", - " \n", - " \n", - " 1.39285\n", - " \n", - " \n", - " -0.063328\n", - " \n", - "
\n", - " 6\n", - " \n", - " \n", - " 7\n", - " \n", - " \n", - " 0.121668\n", - " \n", - " \n", - " 1.2076\n", - " \n", - " \n", - " -0.00204021\n", - " \n", - " \n", - " 1.6278\n", - " \n", - "
\n", - " 7\n", - " \n", - " \n", - " 8\n", - " \n", - " \n", - " 0.354493\n", - " \n", - " \n", - " 1.03753\n", - " \n", - " \n", - " -0.385684\n", - " \n", - " \n", - " 0.519818\n", - " \n", - "
\n", - " 8\n", - " \n", - " \n", - " 9\n", - " \n", - " \n", - " 1.68658\n", - " \n", - " \n", - " -1.32596\n", - " \n", - " \n", - " 1.42898\n", - " \n", - " \n", - " -2.08935\n", - " \n", - "
\n", - " 9\n", - " \n", - " \n", - " 10\n", - " \n", - " \n", - " -0.12982\n", - " \n", - " \n", - " 0.631523\n", - " \n", - " \n", - " -0.586538\n", - " \n", - " \n", - " 0.29072\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df.style" ] @@ -381,31 +101,11 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/plain": [ - "['',\n", - " ' \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " A\n", - " \n", - " B\n", - " \n", - " C\n", - " \n", - " D\n", - " \n", - " E\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " 1.32921\n", - " \n", - " \n", - " nan\n", - " \n", - " \n", - " -0.31628\n", - " \n", - " \n", - " -0.99081\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " 2\n", - " \n", - " \n", - " -1.07082\n", - " \n", - " \n", - " -1.43871\n", - " \n", - " \n", - " 0.564417\n", - " \n", - " \n", - " 0.295722\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " 3\n", - " \n", - " \n", - " -1.6264\n", - " \n", - " \n", - " 0.219565\n", - " \n", - " \n", - " 0.678805\n", - " \n", - " \n", - " 1.88927\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " 4\n", - " \n", - " \n", - " 0.961538\n", - " \n", - " \n", - " 0.104011\n", - " \n", - " \n", - " -0.481165\n", - " \n", - " \n", - " 0.850229\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " 5\n", - " \n", - " \n", - " 1.45342\n", - " \n", - " \n", - " 1.05774\n", - " \n", - " \n", - " 0.165562\n", - " \n", - " \n", - " 0.515018\n", - " \n", - "
\n", - " 5\n", - " \n", - " \n", - " 6\n", - " \n", - " \n", - " -1.33694\n", - " \n", - " \n", - " 0.562861\n", - " \n", - " \n", - " 1.39285\n", - " \n", - " \n", - " -0.063328\n", - " \n", - "
\n", - " 6\n", - " \n", - " \n", - " 7\n", - " \n", - " \n", - " 0.121668\n", - " \n", - " \n", - " 1.2076\n", - " \n", - " \n", - " -0.00204021\n", - " \n", - " \n", - " 1.6278\n", - " \n", - "
\n", - " 7\n", - " \n", - " \n", - " 8\n", - " \n", - " \n", - " 0.354493\n", - " \n", - " \n", - " 1.03753\n", - " \n", - " \n", - " -0.385684\n", - " \n", - " \n", - " 0.519818\n", - " \n", - "
\n", - " 8\n", - " \n", - " \n", - " 9\n", - " \n", - " \n", - " 1.68658\n", - " \n", - " \n", - " -1.32596\n", - " \n", - " \n", - " 1.42898\n", - " \n", - " \n", - " -2.08935\n", - " \n", - "
\n", - " 9\n", - " \n", - " \n", - " 10\n", - " \n", - " \n", - " -0.12982\n", - " \n", - " \n", - " 0.631523\n", - " \n", - " \n", - " -0.586538\n", - " \n", - " \n", - " 0.29072\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "s = df.style.applymap(color_negative_red)\n", "s" @@ -1052,7 +170,9 @@ "source": [ "Notice the similarity with the standard `df.applymap`, which operates on DataFrames elementwise. We want you to be able to resuse your existing knowledge of how to interact with DataFrames.\n", "\n", - "Notice also that our function returned a string containing the CSS attribute and value, separated by a colon just like in a `\n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " A\n", - " \n", - " B\n", - " \n", - " C\n", - " \n", - " D\n", - " \n", - " E\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " 1.32921\n", - " \n", - " \n", - " nan\n", - " \n", - " \n", - " -0.31628\n", - " \n", - " \n", - " -0.99081\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " 2\n", - " \n", - " \n", - " -1.07082\n", - " \n", - " \n", - " -1.43871\n", - " \n", - " \n", - " 0.564417\n", - " \n", - " \n", - " 0.295722\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " 3\n", - " \n", - " \n", - " -1.6264\n", - " \n", - " \n", - " 0.219565\n", - " \n", - " \n", - " 0.678805\n", - " \n", - " \n", - " 1.88927\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " 4\n", - " \n", - " \n", - " 0.961538\n", - " \n", - " \n", - " 0.104011\n", - " \n", - " \n", - " -0.481165\n", - " \n", - " \n", - " 0.850229\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " 5\n", - " \n", - " \n", - " 1.45342\n", - " \n", - " \n", - " 1.05774\n", - " \n", - " \n", - " 0.165562\n", - " \n", - " \n", - " 0.515018\n", - " \n", - "
\n", - " 5\n", - " \n", - " \n", - " 6\n", - " \n", - " \n", - " -1.33694\n", - " \n", - " \n", - " 0.562861\n", - " \n", - " \n", - " 1.39285\n", - " \n", - " \n", - " -0.063328\n", - " \n", - "
\n", - " 6\n", - " \n", - " \n", - " 7\n", - " \n", - " \n", - " 0.121668\n", - " \n", - " \n", - " 1.2076\n", - " \n", - " \n", - " -0.00204021\n", - " \n", - " \n", - " 1.6278\n", - " \n", - "
\n", - " 7\n", - " \n", - " \n", - " 8\n", - " \n", - " \n", - " 0.354493\n", - " \n", - " \n", - " 1.03753\n", - " \n", - " \n", - " -0.385684\n", - " \n", - " \n", - " 0.519818\n", - " \n", - "
\n", - " 8\n", - " \n", - " \n", - " 9\n", - " \n", - " \n", - " 1.68658\n", - " \n", - " \n", - " -1.32596\n", - " \n", - " \n", - " 1.42898\n", - " \n", - " \n", - " -2.08935\n", - " \n", - "
\n", - " 9\n", - " \n", - " \n", - " 10\n", - " \n", - " \n", - " -0.12982\n", - " \n", - " \n", - " 0.631523\n", - " \n", - " \n", - " -0.586538\n", - " \n", - " \n", - " 0.29072\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df.style.apply(highlight_max)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this case the input is a `Series`, one column at a time.\n", + "Notice that the output shape of `highlight_max` matches the input shape, an array with `len(s)` items." + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1412,693 +228,11 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " A\n", - " \n", - " B\n", - " \n", - " C\n", - " \n", - " D\n", - " \n", - " E\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " 1.32921\n", - " \n", - " \n", - " nan\n", - " \n", - " \n", - " -0.31628\n", - " \n", - " \n", - " -0.99081\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " 2\n", - " \n", - " \n", - " -1.07082\n", - " \n", - " \n", - " -1.43871\n", - " \n", - " \n", - " 0.564417\n", - " \n", - " \n", - " 0.295722\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " 3\n", - " \n", - " \n", - " -1.6264\n", - " \n", - " \n", - " 0.219565\n", - " \n", - " \n", - " 0.678805\n", - " \n", - " \n", - " 1.88927\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " 4\n", - " \n", - " \n", - " 0.961538\n", - " \n", - " \n", - " 0.104011\n", - " \n", - " \n", - " -0.481165\n", - " \n", - " \n", - " 0.850229\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " 5\n", - " \n", - " \n", - " 1.45342\n", - " \n", - " \n", - " 1.05774\n", - " \n", - " \n", - " 0.165562\n", - " \n", - " \n", - " 0.515018\n", - " \n", - "
\n", - " 5\n", - " \n", - " \n", - " 6\n", - " \n", - " \n", - " -1.33694\n", - " \n", - " \n", - " 0.562861\n", - " \n", - " \n", - " 1.39285\n", - " \n", - " \n", - " -0.063328\n", - " \n", - "
\n", - " 6\n", - " \n", - " \n", - " 7\n", - " \n", - " \n", - " 0.121668\n", - " \n", - " \n", - " 1.2076\n", - " \n", - " \n", - " -0.00204021\n", - " \n", - " \n", - " 1.6278\n", - " \n", - "
\n", - " 7\n", - " \n", - " \n", - " 8\n", - " \n", - " \n", - " 0.354493\n", - " \n", - " \n", - " 1.03753\n", - " \n", - " \n", - " -0.385684\n", - " \n", - " \n", - " 0.519818\n", - " \n", - "
\n", - " 8\n", - " \n", - " \n", - " 9\n", - " \n", - " \n", - " 1.68658\n", - " \n", - " \n", - " -1.32596\n", - " \n", - " \n", - " 1.42898\n", - " \n", - " \n", - " -2.08935\n", - " \n", - "
\n", - " 9\n", - " \n", - " \n", - " 10\n", - " \n", - " \n", - " -0.12982\n", - " \n", - " \n", - " 0.631523\n", - " \n", - " \n", - " -0.586538\n", - " \n", - " \n", - " 0.29072\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df.style.\\\n", " applymap(color_negative_red).\\\n", @@ -2121,7 +255,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": { "collapsed": true }, @@ -2141,301 +275,20 @@ " index=data.index, columns=data.columns)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When using ``Styler.apply(func, axis=None)``, the function must return a DataFrame with the same index and column labels." + ] + }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " A\n", - " \n", - " B\n", - " \n", - " C\n", - " \n", - " D\n", - " \n", - " E\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " 1.32921\n", - " \n", - " \n", - " nan\n", - " \n", - " \n", - " -0.31628\n", - " \n", - " \n", - " -0.99081\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " 2\n", - " \n", - " \n", - " -1.07082\n", - " \n", - " \n", - " -1.43871\n", - " \n", - " \n", - " 0.564417\n", - " \n", - " \n", - " 0.295722\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " 3\n", - " \n", - " \n", - " -1.6264\n", - " \n", - " \n", - " 0.219565\n", - " \n", - " \n", - " 0.678805\n", - " \n", - " \n", - " 1.88927\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " 4\n", - " \n", - " \n", - " 0.961538\n", - " \n", - " \n", - " 0.104011\n", - " \n", - " \n", - " -0.481165\n", - " \n", - " \n", - " 0.850229\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " 5\n", - " \n", - " \n", - " 1.45342\n", - " \n", - " \n", - " 1.05774\n", - " \n", - " \n", - " 0.165562\n", - " \n", - " \n", - " 0.515018\n", - " \n", - "
\n", - " 5\n", - " \n", - " \n", - " 6\n", - " \n", - " \n", - " -1.33694\n", - " \n", - " \n", - " 0.562861\n", - " \n", - " \n", - " 1.39285\n", - " \n", - " \n", - " -0.063328\n", - " \n", - "
\n", - " 6\n", - " \n", - " \n", - " 7\n", - " \n", - " \n", - " 0.121668\n", - " \n", - " \n", - " 1.2076\n", - " \n", - " \n", - " -0.00204021\n", - " \n", - " \n", - " 1.6278\n", - " \n", - "
\n", - " 7\n", - " \n", - " \n", - " 8\n", - " \n", - " \n", - " 0.354493\n", - " \n", - " \n", - " 1.03753\n", - " \n", - " \n", - " -0.385684\n", - " \n", - " \n", - " 0.519818\n", - " \n", - "
\n", - " 8\n", - " \n", - " \n", - " 9\n", - " \n", - " \n", - " 1.68658\n", - " \n", - " \n", - " -1.32596\n", - " \n", - " \n", - " 1.42898\n", - " \n", - " \n", - " -2.08935\n", - " \n", - "
\n", - " 9\n", - " \n", - " \n", - " 10\n", - " \n", - " \n", - " -0.12982\n", - " \n", - " \n", - " 0.631523\n", - " \n", - " \n", - " -0.586538\n", - " \n", - " \n", - " 0.29072\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df.style.apply(highlight_max, color='darkorange', axis=None)" ] @@ -2451,7 +304,9 @@ "- `Styler.applymap(func)` for elementwise styles\n", "- `Styler.apply(func, axis=0)` for columnwise styles\n", "- `Styler.apply(func, axis=1)` for rowwise styles\n", - "- `Styler.apply(func, axis=None)` for tablewise styles" + "- `Styler.apply(func, axis=None)` for tablewise styles\n", + "\n", + "And crucially the input and output shapes of `func` must match. If `x` is the input then ``func(x).shape == x.shape``." ] }, { @@ -2479,311 +334,11 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " A\n", - " \n", - " B\n", - " \n", - " C\n", - " \n", - " D\n", - " \n", - " E\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " 1.32921\n", - " \n", - " \n", - " nan\n", - " \n", - " \n", - " -0.31628\n", - " \n", - " \n", - " -0.99081\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " 2\n", - " \n", - " \n", - " -1.07082\n", - " \n", - " \n", - " -1.43871\n", - " \n", - " \n", - " 0.564417\n", - " \n", - " \n", - " 0.295722\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " 3\n", - " \n", - " \n", - " -1.6264\n", - " \n", - " \n", - " 0.219565\n", - " \n", - " \n", - " 0.678805\n", - " \n", - " \n", - " 1.88927\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " 4\n", - " \n", - " \n", - " 0.961538\n", - " \n", - " \n", - " 0.104011\n", - " \n", - " \n", - " -0.481165\n", - " \n", - " \n", - " 0.850229\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " 5\n", - " \n", - " \n", - " 1.45342\n", - " \n", - " \n", - " 1.05774\n", - " \n", - " \n", - " 0.165562\n", - " \n", - " \n", - " 0.515018\n", - " \n", - "
\n", - " 5\n", - " \n", - " \n", - " 6\n", - " \n", - " \n", - " -1.33694\n", - " \n", - " \n", - " 0.562861\n", - " \n", - " \n", - " 1.39285\n", - " \n", - " \n", - " -0.063328\n", - " \n", - "
\n", - " 6\n", - " \n", - " \n", - " 7\n", - " \n", - " \n", - " 0.121668\n", - " \n", - " \n", - " 1.2076\n", - " \n", - " \n", - " -0.00204021\n", - " \n", - " \n", - " 1.6278\n", - " \n", - "
\n", - " 7\n", - " \n", - " \n", - " 8\n", - " \n", - " \n", - " 0.354493\n", - " \n", - " \n", - " 1.03753\n", - " \n", - " \n", - " -0.385684\n", - " \n", - " \n", - " 0.519818\n", - " \n", - "
\n", - " 8\n", - " \n", - " \n", - " 9\n", - " \n", - " \n", - " 1.68658\n", - " \n", - " \n", - " -1.32596\n", - " \n", - " \n", - " 1.42898\n", - " \n", - " \n", - " -2.08935\n", - " \n", - "
\n", - " 9\n", - " \n", - " \n", - " 10\n", - " \n", - " \n", - " -0.12982\n", - " \n", - " \n", - " 0.631523\n", - " \n", - " \n", - " -0.586538\n", - " \n", - " \n", - " 0.29072\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df.style.apply(highlight_max, subset=['B', 'C', 'D'])" ] @@ -2797,341 +352,11 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " A\n", - " \n", - " B\n", - " \n", - " C\n", - " \n", - " D\n", - " \n", - " E\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " 1.32921\n", - " \n", - " \n", - " nan\n", - " \n", - " \n", - " -0.31628\n", - " \n", - " \n", - " -0.99081\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " 2\n", - " \n", - " \n", - " -1.07082\n", - " \n", - " \n", - " -1.43871\n", - " \n", - " \n", - " 0.564417\n", - " \n", - " \n", - " 0.295722\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " 3\n", - " \n", - " \n", - " -1.6264\n", - " \n", - " \n", - " 0.219565\n", - " \n", - " \n", - " 0.678805\n", - " \n", - " \n", - " 1.88927\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " 4\n", - " \n", - " \n", - " 0.961538\n", - " \n", - " \n", - " 0.104011\n", - " \n", - " \n", - " -0.481165\n", - " \n", - " \n", - " 0.850229\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " 5\n", - " \n", - " \n", - " 1.45342\n", - " \n", - " \n", - " 1.05774\n", - " \n", - " \n", - " 0.165562\n", - " \n", - " \n", - " 0.515018\n", - " \n", - "
\n", - " 5\n", - " \n", - " \n", - " 6\n", - " \n", - " \n", - " -1.33694\n", - " \n", - " \n", - " 0.562861\n", - " \n", - " \n", - " 1.39285\n", - " \n", - " \n", - " -0.063328\n", - " \n", - "
\n", - " 6\n", - " \n", - " \n", - " 7\n", - " \n", - " \n", - " 0.121668\n", - " \n", - " \n", - " 1.2076\n", - " \n", - " \n", - " -0.00204021\n", - " \n", - " \n", - " 1.6278\n", - " \n", - "
\n", - " 7\n", - " \n", - " \n", - " 8\n", - " \n", - " \n", - " 0.354493\n", - " \n", - " \n", - " 1.03753\n", - " \n", - " \n", - " -0.385684\n", - " \n", - " \n", - " 0.519818\n", - " \n", - "
\n", - " 8\n", - " \n", - " \n", - " 9\n", - " \n", - " \n", - " 1.68658\n", - " \n", - " \n", - " -1.32596\n", - " \n", - " \n", - " 1.42898\n", - " \n", - " \n", - " -2.08935\n", - " \n", - "
\n", - " 9\n", - " \n", - " \n", - " 10\n", - " \n", - " \n", - " -0.12982\n", - " \n", - " \n", - " 0.631523\n", - " \n", - " \n", - " -0.586538\n", - " \n", - " \n", - " 0.29072\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df.style.applymap(color_negative_red,\n", " subset=pd.IndexSlice[2:5, ['B', 'D']])" @@ -3162,293 +387,11 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " A\n", - " \n", - " B\n", - " \n", - " C\n", - " \n", - " D\n", - " \n", - " E\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " 100.00%\n", - " \n", - " \n", - " 132.92%\n", - " \n", - " \n", - " nan%\n", - " \n", - " \n", - " -31.63%\n", - " \n", - " \n", - " -99.08%\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " 200.00%\n", - " \n", - " \n", - " -107.08%\n", - " \n", - " \n", - " -143.87%\n", - " \n", - " \n", - " 56.44%\n", - " \n", - " \n", - " 29.57%\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " 300.00%\n", - " \n", - " \n", - " -162.64%\n", - " \n", - " \n", - " 21.96%\n", - " \n", - " \n", - " 67.88%\n", - " \n", - " \n", - " 188.93%\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " 400.00%\n", - " \n", - " \n", - " 96.15%\n", - " \n", - " \n", - " 10.40%\n", - " \n", - " \n", - " -48.12%\n", - " \n", - " \n", - " 85.02%\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " 500.00%\n", - " \n", - " \n", - " 145.34%\n", - " \n", - " \n", - " 105.77%\n", - " \n", - " \n", - " 16.56%\n", - " \n", - " \n", - " 51.50%\n", - " \n", - "
\n", - " 5\n", - " \n", - " \n", - " 600.00%\n", - " \n", - " \n", - " -133.69%\n", - " \n", - " \n", - " 56.29%\n", - " \n", - " \n", - " 139.29%\n", - " \n", - " \n", - " -6.33%\n", - " \n", - "
\n", - " 6\n", - " \n", - " \n", - " 700.00%\n", - " \n", - " \n", - " 12.17%\n", - " \n", - " \n", - " 120.76%\n", - " \n", - " \n", - " -0.20%\n", - " \n", - " \n", - " 162.78%\n", - " \n", - "
\n", - " 7\n", - " \n", - " \n", - " 800.00%\n", - " \n", - " \n", - " 35.45%\n", - " \n", - " \n", - " 103.75%\n", - " \n", - " \n", - " -38.57%\n", - " \n", - " \n", - " 51.98%\n", - " \n", - "
\n", - " 8\n", - " \n", - " \n", - " 900.00%\n", - " \n", - " \n", - " 168.66%\n", - " \n", - " \n", - " -132.60%\n", - " \n", - " \n", - " 142.90%\n", - " \n", - " \n", - " -208.94%\n", - " \n", - "
\n", - " 9\n", - " \n", - " \n", - " 1000.00%\n", - " \n", - " \n", - " -12.98%\n", - " \n", - " \n", - " 63.15%\n", - " \n", - " \n", - " -58.65%\n", - " \n", - " \n", - " 29.07%\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df.style.format(\"{:.2%}\")" ] @@ -3462,293 +405,11 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " A\n", - " \n", - " B\n", - " \n", - " C\n", - " \n", - " D\n", - " \n", - " E\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " 1000\n", - " \n", - " \n", - " nan\n", - " \n", - " \n", - " -0.32\n", - " \n", - " \n", - " -0.99081\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " 2\n", - " \n", - " \n", - " -100\n", - " \n", - " \n", - " -1.43871\n", - " \n", - " \n", - " +0.56\n", - " \n", - " \n", - " 0.295722\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " 3\n", - " \n", - " \n", - " -200\n", - " \n", - " \n", - " 0.219565\n", - " \n", - " \n", - " +0.68\n", - " \n", - " \n", - " 1.88927\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " 4\n", - " \n", - " \n", - " 1000\n", - " \n", - " \n", - " 0.104011\n", - " \n", - " \n", - " -0.48\n", - " \n", - " \n", - " 0.850229\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " 5\n", - " \n", - " \n", - " 1000\n", - " \n", - " \n", - " 1.05774\n", - " \n", - " \n", - " +0.17\n", - " \n", - " \n", - " 0.515018\n", - " \n", - "
\n", - " 5\n", - " \n", - " \n", - " 6\n", - " \n", - " \n", - " -100\n", - " \n", - " \n", - " 0.562861\n", - " \n", - " \n", - " +1.39\n", - " \n", - " \n", - " -0.063328\n", - " \n", - "
\n", - " 6\n", - " \n", - " \n", - " 7\n", - " \n", - " \n", - " 0000\n", - " \n", - " \n", - " 1.2076\n", - " \n", - " \n", - " -0.00\n", - " \n", - " \n", - " 1.6278\n", - " \n", - "
\n", - " 7\n", - " \n", - " \n", - " 8\n", - " \n", - " \n", - " 0000\n", - " \n", - " \n", - " 1.03753\n", - " \n", - " \n", - " -0.39\n", - " \n", - " \n", - " 0.519818\n", - " \n", - "
\n", - " 8\n", - " \n", - " \n", - " 9\n", - " \n", - " \n", - " 2000\n", - " \n", - " \n", - " -1.32596\n", - " \n", - " \n", - " +1.43\n", - " \n", - " \n", - " -2.08935\n", - " \n", - "
\n", - " 9\n", - " \n", - " \n", - " 10\n", - " \n", - " \n", - " -000\n", - " \n", - " \n", - " 0.631523\n", - " \n", - " \n", - " -0.59\n", - " \n", - " \n", - " 0.29072\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df.style.format({'B': \"{:0<4.0f}\", 'D': '{:+.2f}'})" ] @@ -3762,293 +423,11 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " A\n", - " \n", - " B\n", - " \n", - " C\n", - " \n", - " D\n", - " \n", - " E\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " ±1.33\n", - " \n", - " \n", - " nan\n", - " \n", - " \n", - " -0.31628\n", - " \n", - " \n", - " -0.99081\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " 2\n", - " \n", - " \n", - " ±1.07\n", - " \n", - " \n", - " -1.43871\n", - " \n", - " \n", - " 0.564417\n", - " \n", - " \n", - " 0.295722\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " 3\n", - " \n", - " \n", - " ±1.63\n", - " \n", - " \n", - " 0.219565\n", - " \n", - " \n", - " 0.678805\n", - " \n", - " \n", - " 1.88927\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " 4\n", - " \n", - " \n", - " ±0.96\n", - " \n", - " \n", - " 0.104011\n", - " \n", - " \n", - " -0.481165\n", - " \n", - " \n", - " 0.850229\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " 5\n", - " \n", - " \n", - " ±1.45\n", - " \n", - " \n", - " 1.05774\n", - " \n", - " \n", - " 0.165562\n", - " \n", - " \n", - " 0.515018\n", - " \n", - "
\n", - " 5\n", - " \n", - " \n", - " 6\n", - " \n", - " \n", - " ±1.34\n", - " \n", - " \n", - " 0.562861\n", - " \n", - " \n", - " 1.39285\n", - " \n", - " \n", - " -0.063328\n", - " \n", - "
\n", - " 6\n", - " \n", - " \n", - " 7\n", - " \n", - " \n", - " ±0.12\n", - " \n", - " \n", - " 1.2076\n", - " \n", - " \n", - " -0.00204021\n", - " \n", - " \n", - " 1.6278\n", - " \n", - "
\n", - " 7\n", - " \n", - " \n", - " 8\n", - " \n", - " \n", - " ±0.35\n", - " \n", - " \n", - " 1.03753\n", - " \n", - " \n", - " -0.385684\n", - " \n", - " \n", - " 0.519818\n", - " \n", - "
\n", - " 8\n", - " \n", - " \n", - " 9\n", - " \n", - " \n", - " ±1.69\n", - " \n", - " \n", - " -1.32596\n", - " \n", - " \n", - " 1.42898\n", - " \n", - " \n", - " -2.08935\n", - " \n", - "
\n", - " 9\n", - " \n", - " \n", - " 10\n", - " \n", - " \n", - " ±0.13\n", - " \n", - " \n", - " 0.631523\n", - " \n", - " \n", - " -0.586538\n", - " \n", - " \n", - " 0.29072\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df.style.format({\"B\": lambda x: \"±{:.2f}\".format(abs(x))})" ] @@ -4069,299 +448,11 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " A\n", - " \n", - " B\n", - " \n", - " C\n", - " \n", - " D\n", - " \n", - " E\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " 1.32921\n", - " \n", - " \n", - " nan\n", - " \n", - " \n", - " -0.31628\n", - " \n", - " \n", - " -0.99081\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " 2\n", - " \n", - " \n", - " -1.07082\n", - " \n", - " \n", - " -1.43871\n", - " \n", - " \n", - " 0.564417\n", - " \n", - " \n", - " 0.295722\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " 3\n", - " \n", - " \n", - " -1.6264\n", - " \n", - " \n", - " 0.219565\n", - " \n", - " \n", - " 0.678805\n", - " \n", - " \n", - " 1.88927\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " 4\n", - " \n", - " \n", - " 0.961538\n", - " \n", - " \n", - " 0.104011\n", - " \n", - " \n", - " -0.481165\n", - " \n", - " \n", - " 0.850229\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " 5\n", - " \n", - " \n", - " 1.45342\n", - " \n", - " \n", - " 1.05774\n", - " \n", - " \n", - " 0.165562\n", - " \n", - " \n", - " 0.515018\n", - " \n", - "
\n", - " 5\n", - " \n", - " \n", - " 6\n", - " \n", - " \n", - " -1.33694\n", - " \n", - " \n", - " 0.562861\n", - " \n", - " \n", - " 1.39285\n", - " \n", - " \n", - " -0.063328\n", - " \n", - "
\n", - " 6\n", - " \n", - " \n", - " 7\n", - " \n", - " \n", - " 0.121668\n", - " \n", - " \n", - " 1.2076\n", - " \n", - " \n", - " -0.00204021\n", - " \n", - " \n", - " 1.6278\n", - " \n", - "
\n", - " 7\n", - " \n", - " \n", - " 8\n", - " \n", - " \n", - " 0.354493\n", - " \n", - " \n", - " 1.03753\n", - " \n", - " \n", - " -0.385684\n", - " \n", - " \n", - " 0.519818\n", - " \n", - "
\n", - " 8\n", - " \n", - " \n", - " 9\n", - " \n", - " \n", - " 1.68658\n", - " \n", - " \n", - " -1.32596\n", - " \n", - " \n", - " 1.42898\n", - " \n", - " \n", - " -2.08935\n", - " \n", - "
\n", - " 9\n", - " \n", - " \n", - " 10\n", - " \n", - " \n", - " -0.12982\n", - " \n", - " \n", - " 0.631523\n", - " \n", - " \n", - " -0.586538\n", - " \n", - " \n", - " 0.29072\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df.style.highlight_null(null_color='red')" ] @@ -4375,593 +466,11 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " A\n", - " \n", - " B\n", - " \n", - " C\n", - " \n", - " D\n", - " \n", - " E\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " 1.32921\n", - " \n", - " \n", - " nan\n", - " \n", - " \n", - " -0.31628\n", - " \n", - " \n", - " -0.99081\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " 2\n", - " \n", - " \n", - " -1.07082\n", - " \n", - " \n", - " -1.43871\n", - " \n", - " \n", - " 0.564417\n", - " \n", - " \n", - " 0.295722\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " 3\n", - " \n", - " \n", - " -1.6264\n", - " \n", - " \n", - " 0.219565\n", - " \n", - " \n", - " 0.678805\n", - " \n", - " \n", - " 1.88927\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " 4\n", - " \n", - " \n", - " 0.961538\n", - " \n", - " \n", - " 0.104011\n", - " \n", - " \n", - " -0.481165\n", - " \n", - " \n", - " 0.850229\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " 5\n", - " \n", - " \n", - " 1.45342\n", - " \n", - " \n", - " 1.05774\n", - " \n", - " \n", - " 0.165562\n", - " \n", - " \n", - " 0.515018\n", - " \n", - "
\n", - " 5\n", - " \n", - " \n", - " 6\n", - " \n", - " \n", - " -1.33694\n", - " \n", - " \n", - " 0.562861\n", - " \n", - " \n", - " 1.39285\n", - " \n", - " \n", - " -0.063328\n", - " \n", - "
\n", - " 6\n", - " \n", - " \n", - " 7\n", - " \n", - " \n", - " 0.121668\n", - " \n", - " \n", - " 1.2076\n", - " \n", - " \n", - " -0.00204021\n", - " \n", - " \n", - " 1.6278\n", - " \n", - "
\n", - " 7\n", - " \n", - " \n", - " 8\n", - " \n", - " \n", - " 0.354493\n", - " \n", - " \n", - " 1.03753\n", - " \n", - " \n", - " -0.385684\n", - " \n", - " \n", - " 0.519818\n", - " \n", - "
\n", - " 8\n", - " \n", - " \n", - " 9\n", - " \n", - " \n", - " 1.68658\n", - " \n", - " \n", - " -1.32596\n", - " \n", - " \n", - " 1.42898\n", - " \n", - " \n", - " -2.08935\n", - " \n", - "
\n", - " 9\n", - " \n", - " \n", - " 10\n", - " \n", - " \n", - " -0.12982\n", - " \n", - " \n", - " 0.631523\n", - " \n", - " \n", - " -0.586538\n", - " \n", - " \n", - " 0.29072\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "import seaborn as sns\n", "\n", @@ -4980,333 +489,11 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " A\n", - " \n", - " B\n", - " \n", - " C\n", - " \n", - " D\n", - " \n", - " E\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " 1.32921\n", - " \n", - " \n", - " nan\n", - " \n", - " \n", - " -0.31628\n", - " \n", - " \n", - " -0.99081\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " 2\n", - " \n", - " \n", - " -1.07082\n", - " \n", - " \n", - " -1.43871\n", - " \n", - " \n", - " 0.564417\n", - " \n", - " \n", - " 0.295722\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " 3\n", - " \n", - " \n", - " -1.6264\n", - " \n", - " \n", - " 0.219565\n", - " \n", - " \n", - " 0.678805\n", - " \n", - " \n", - " 1.88927\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " 4\n", - " \n", - " \n", - " 0.961538\n", - " \n", - " \n", - " 0.104011\n", - " \n", - " \n", - " -0.481165\n", - " \n", - " \n", - " 0.850229\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " 5\n", - " \n", - " \n", - " 1.45342\n", - " \n", - " \n", - " 1.05774\n", - " \n", - " \n", - " 0.165562\n", - " \n", - " \n", - " 0.515018\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Uses the full color range\n", "df.loc[:4].style.background_gradient(cmap='viridis')" @@ -5314,383 +501,11 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " A\n", - " \n", - " B\n", - " \n", - " C\n", - " \n", - " D\n", - " \n", - " E\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " 1.32921\n", - " \n", - " \n", - " nan\n", - " \n", - " \n", - " -0.31628\n", - " \n", - " \n", - " -0.99081\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " 2\n", - " \n", - " \n", - " -1.07082\n", - " \n", - " \n", - " -1.43871\n", - " \n", - " \n", - " 0.564417\n", - " \n", - " \n", - " 0.295722\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " 3\n", - " \n", - " \n", - " -1.6264\n", - " \n", - " \n", - " 0.219565\n", - " \n", - " \n", - " 0.678805\n", - " \n", - " \n", - " 1.88927\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " 4\n", - " \n", - " \n", - " 0.961538\n", - " \n", - " \n", - " 0.104011\n", - " \n", - " \n", - " -0.481165\n", - " \n", - " \n", - " 0.850229\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " 5\n", - " \n", - " \n", - " 1.45342\n", - " \n", - " \n", - " 1.05774\n", - " \n", - " \n", - " 0.165562\n", - " \n", - " \n", - " 0.515018\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Compreess the color range\n", "(df.loc[:4]\n", @@ -5708,489 +523,11 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " A\n", - " \n", - " B\n", - " \n", - " C\n", - " \n", - " D\n", - " \n", - " E\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " 1.32921\n", - " \n", - " \n", - " nan\n", - " \n", - " \n", - " -0.31628\n", - " \n", - " \n", - " -0.99081\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " 2\n", - " \n", - " \n", - " -1.07082\n", - " \n", - " \n", - " -1.43871\n", - " \n", - " \n", - " 0.564417\n", - " \n", - " \n", - " 0.295722\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " 3\n", - " \n", - " \n", - " -1.6264\n", - " \n", - " \n", - " 0.219565\n", - " \n", - " \n", - " 0.678805\n", - " \n", - " \n", - " 1.88927\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " 4\n", - " \n", - " \n", - " 0.961538\n", - " \n", - " \n", - " 0.104011\n", - " \n", - " \n", - " -0.481165\n", - " \n", - " \n", - " 0.850229\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " 5\n", - " \n", - " \n", - " 1.45342\n", - " \n", - " \n", - " 1.05774\n", - " \n", - " \n", - " 0.165562\n", - " \n", - " \n", - " 0.515018\n", - " \n", - "
\n", - " 5\n", - " \n", - " \n", - " 6\n", - " \n", - " \n", - " -1.33694\n", - " \n", - " \n", - " 0.562861\n", - " \n", - " \n", - " 1.39285\n", - " \n", - " \n", - " -0.063328\n", - " \n", - "
\n", - " 6\n", - " \n", - " \n", - " 7\n", - " \n", - " \n", - " 0.121668\n", - " \n", - " \n", - " 1.2076\n", - " \n", - " \n", - " -0.00204021\n", - " \n", - " \n", - " 1.6278\n", - " \n", - "
\n", - " 7\n", - " \n", - " \n", - " 8\n", - " \n", - " \n", - " 0.354493\n", - " \n", - " \n", - " 1.03753\n", - " \n", - " \n", - " -0.385684\n", - " \n", - " \n", - " 0.519818\n", - " \n", - "
\n", - " 8\n", - " \n", - " \n", - " 9\n", - " \n", - " \n", - " 1.68658\n", - " \n", - " \n", - " -1.32596\n", - " \n", - " \n", - " 1.42898\n", - " \n", - " \n", - " -2.08935\n", - " \n", - "
\n", - " 9\n", - " \n", - " \n", - " 10\n", - " \n", - " \n", - " -0.12982\n", - " \n", - " \n", - " 0.631523\n", - " \n", - " \n", - " -0.586538\n", - " \n", - " \n", - " 0.29072\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df.style.bar(subset=['A', 'B'], color='#d65f5f')" ] @@ -6204,646 +541,22 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " A\n", - " \n", - " B\n", - " \n", - " C\n", - " \n", - " D\n", - " \n", - " E\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " 1.32921\n", - " \n", - " \n", - " nan\n", - " \n", - " \n", - " -0.31628\n", - " \n", - " \n", - " -0.99081\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " 2\n", - " \n", - " \n", - " -1.07082\n", - " \n", - " \n", - " -1.43871\n", - " \n", - " \n", - " 0.564417\n", - " \n", - " \n", - " 0.295722\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " 3\n", - " \n", - " \n", - " -1.6264\n", - " \n", - " \n", - " 0.219565\n", - " \n", - " \n", - " 0.678805\n", - " \n", - " \n", - " 1.88927\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " 4\n", - " \n", - " \n", - " 0.961538\n", - " \n", - " \n", - " 0.104011\n", - " \n", - " \n", - " -0.481165\n", - " \n", - " \n", - " 0.850229\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " 5\n", - " \n", - " \n", - " 1.45342\n", - " \n", - " \n", - " 1.05774\n", - " \n", - " \n", - " 0.165562\n", - " \n", - " \n", - " 0.515018\n", - " \n", - "
\n", - " 5\n", - " \n", - " \n", - " 6\n", - " \n", - " \n", - " -1.33694\n", - " \n", - " \n", - " 0.562861\n", - " \n", - " \n", - " 1.39285\n", - " \n", - " \n", - " -0.063328\n", - " \n", - "
\n", - " 6\n", - " \n", - " \n", - " 7\n", - " \n", - " \n", - " 0.121668\n", - " \n", - " \n", - " 1.2076\n", - " \n", - " \n", - " -0.00204021\n", - " \n", - " \n", - " 1.6278\n", - " \n", - "
\n", - " 7\n", - " \n", - " \n", - " 8\n", - " \n", - " \n", - " 0.354493\n", - " \n", - " \n", - " 1.03753\n", - " \n", - " \n", - " -0.385684\n", - " \n", - " \n", - " 0.519818\n", - " \n", - "
\n", - " 8\n", - " \n", - " \n", - " 9\n", - " \n", - " \n", - " 1.68658\n", - " \n", - " \n", - " -1.32596\n", - " \n", - " \n", - " 1.42898\n", - " \n", - " \n", - " -2.08935\n", - " \n", - "
\n", - " 9\n", - " \n", - " \n", - " 10\n", - " \n", - " \n", - " -0.12982\n", - " \n", - " \n", - " 0.631523\n", - " \n", - " \n", - " -0.586538\n", - " \n", - " \n", - " 0.29072\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df.style.highlight_max(axis=0)" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " A\n", - " \n", - " B\n", - " \n", - " C\n", - " \n", - " D\n", - " \n", - " E\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " 1.32921\n", - " \n", - " \n", - " nan\n", - " \n", - " \n", - " -0.31628\n", - " \n", - " \n", - " -0.99081\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " 2\n", - " \n", - " \n", - " -1.07082\n", - " \n", - " \n", - " -1.43871\n", - " \n", - " \n", - " 0.564417\n", - " \n", - " \n", - " 0.295722\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " 3\n", - " \n", - " \n", - " -1.6264\n", - " \n", - " \n", - " 0.219565\n", - " \n", - " \n", - " 0.678805\n", - " \n", - " \n", - " 1.88927\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " 4\n", - " \n", - " \n", - " 0.961538\n", - " \n", - " \n", - " 0.104011\n", - " \n", - " \n", - " -0.481165\n", - " \n", - " \n", - " 0.850229\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " 5\n", - " \n", - " \n", - " 1.45342\n", - " \n", - " \n", - " 1.05774\n", - " \n", - " \n", - " 0.165562\n", - " \n", - " \n", - " 0.515018\n", - " \n", - "
\n", - " 5\n", - " \n", - " \n", - " 6\n", - " \n", - " \n", - " -1.33694\n", - " \n", - " \n", - " 0.562861\n", - " \n", - " \n", - " 1.39285\n", - " \n", - " \n", - " -0.063328\n", - " \n", - "
\n", - " 6\n", - " \n", - " \n", - " 7\n", - " \n", - " \n", - " 0.121668\n", - " \n", - " \n", - " 1.2076\n", - " \n", - " \n", - " -0.00204021\n", - " \n", - " \n", - " 1.6278\n", - " \n", - "
\n", - " 7\n", - " \n", - " \n", - " 8\n", - " \n", - " \n", - " 0.354493\n", - " \n", - " \n", - " 1.03753\n", - " \n", - " \n", - " -0.385684\n", - " \n", - " \n", - " 0.519818\n", - " \n", - "
\n", - " 8\n", - " \n", - " \n", - " 9\n", - " \n", - " \n", - " 1.68658\n", - " \n", - " \n", - " -1.32596\n", - " \n", - " \n", - " 1.42898\n", - " \n", - " \n", - " -2.08935\n", - " \n", - "
\n", - " 9\n", - " \n", - " \n", - " 10\n", - " \n", - " \n", - " -0.12982\n", - " \n", - " \n", - " 0.631523\n", - " \n", - " \n", - " -0.586538\n", - " \n", - " \n", - " 0.29072\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df.style.highlight_min(axis=0)" ] @@ -6857,793 +570,11 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " A\n", - " \n", - " B\n", - " \n", - " C\n", - " \n", - " D\n", - " \n", - " E\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " 1.32921\n", - " \n", - " \n", - " nan\n", - " \n", - " \n", - " -0.31628\n", - " \n", - " \n", - " -0.99081\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " 2\n", - " \n", - " \n", - " -1.07082\n", - " \n", - " \n", - " -1.43871\n", - " \n", - " \n", - " 0.564417\n", - " \n", - " \n", - " 0.295722\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " 3\n", - " \n", - " \n", - " -1.6264\n", - " \n", - " \n", - " 0.219565\n", - " \n", - " \n", - " 0.678805\n", - " \n", - " \n", - " 1.88927\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " 4\n", - " \n", - " \n", - " 0.961538\n", - " \n", - " \n", - " 0.104011\n", - " \n", - " \n", - " -0.481165\n", - " \n", - " \n", - " 0.850229\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " 5\n", - " \n", - " \n", - " 1.45342\n", - " \n", - " \n", - " 1.05774\n", - " \n", - " \n", - " 0.165562\n", - " \n", - " \n", - " 0.515018\n", - " \n", - "
\n", - " 5\n", - " \n", - " \n", - " 6\n", - " \n", - " \n", - " -1.33694\n", - " \n", - " \n", - " 0.562861\n", - " \n", - " \n", - " 1.39285\n", - " \n", - " \n", - " -0.063328\n", - " \n", - "
\n", - " 6\n", - " \n", - " \n", - " 7\n", - " \n", - " \n", - " 0.121668\n", - " \n", - " \n", - " 1.2076\n", - " \n", - " \n", - " -0.00204021\n", - " \n", - " \n", - " 1.6278\n", - " \n", - "
\n", - " 7\n", - " \n", - " \n", - " 8\n", - " \n", - " \n", - " 0.354493\n", - " \n", - " \n", - " 1.03753\n", - " \n", - " \n", - " -0.385684\n", - " \n", - " \n", - " 0.519818\n", - " \n", - "
\n", - " 8\n", - " \n", - " \n", - " 9\n", - " \n", - " \n", - " 1.68658\n", - " \n", - " \n", - " -1.32596\n", - " \n", - " \n", - " 1.42898\n", - " \n", - " \n", - " -2.08935\n", - " \n", - "
\n", - " 9\n", - " \n", - " \n", - " 10\n", - " \n", - " \n", - " -0.12982\n", - " \n", - " \n", - " 0.631523\n", - " \n", - " \n", - " -0.586538\n", - " \n", - " \n", - " 0.29072\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df.style.set_properties(**{'background-color': 'black',\n", " 'color': 'lawngreen',\n", @@ -7666,593 +597,11 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " A\n", - " \n", - " B\n", - " \n", - " C\n", - " \n", - " D\n", - " \n", - " E\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " 1.32921\n", - " \n", - " \n", - " nan\n", - " \n", - " \n", - " -0.31628\n", - " \n", - " \n", - " -0.99081\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " 2\n", - " \n", - " \n", - " -1.07082\n", - " \n", - " \n", - " -1.43871\n", - " \n", - " \n", - " 0.564417\n", - " \n", - " \n", - " 0.295722\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " 3\n", - " \n", - " \n", - " -1.6264\n", - " \n", - " \n", - " 0.219565\n", - " \n", - " \n", - " 0.678805\n", - " \n", - " \n", - " 1.88927\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " 4\n", - " \n", - " \n", - " 0.961538\n", - " \n", - " \n", - " 0.104011\n", - " \n", - " \n", - " -0.481165\n", - " \n", - " \n", - " 0.850229\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " 5\n", - " \n", - " \n", - " 1.45342\n", - " \n", - " \n", - " 1.05774\n", - " \n", - " \n", - " 0.165562\n", - " \n", - " \n", - " 0.515018\n", - " \n", - "
\n", - " 5\n", - " \n", - " \n", - " 6\n", - " \n", - " \n", - " -1.33694\n", - " \n", - " \n", - " 0.562861\n", - " \n", - " \n", - " 1.39285\n", - " \n", - " \n", - " -0.063328\n", - " \n", - "
\n", - " 6\n", - " \n", - " \n", - " 7\n", - " \n", - " \n", - " 0.121668\n", - " \n", - " \n", - " 1.2076\n", - " \n", - " \n", - " -0.00204021\n", - " \n", - " \n", - " 1.6278\n", - " \n", - "
\n", - " 7\n", - " \n", - " \n", - " 8\n", - " \n", - " \n", - " 0.354493\n", - " \n", - " \n", - " 1.03753\n", - " \n", - " \n", - " -0.385684\n", - " \n", - " \n", - " 0.519818\n", - " \n", - "
\n", - " 8\n", - " \n", - " \n", - " 9\n", - " \n", - " \n", - " 1.68658\n", - " \n", - " \n", - " -1.32596\n", - " \n", - " \n", - " 1.42898\n", - " \n", - " \n", - " -2.08935\n", - " \n", - "
\n", - " 9\n", - " \n", - " \n", - " 10\n", - " \n", - " \n", - " -0.12982\n", - " \n", - " \n", - " 0.631523\n", - " \n", - " \n", - " -0.586538\n", - " \n", - " \n", - " 0.29072\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df2 = -df\n", "style1 = df.style.applymap(color_negative_red)\n", @@ -8261,593 +610,11 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " A\n", - " \n", - " B\n", - " \n", - " C\n", - " \n", - " D\n", - " \n", - " E\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " -1\n", - " \n", - " \n", - " -1.32921\n", - " \n", - " \n", - " nan\n", - " \n", - " \n", - " 0.31628\n", - " \n", - " \n", - " 0.99081\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " -2\n", - " \n", - " \n", - " 1.07082\n", - " \n", - " \n", - " 1.43871\n", - " \n", - " \n", - " -0.564417\n", - " \n", - " \n", - " -0.295722\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " -3\n", - " \n", - " \n", - " 1.6264\n", - " \n", - " \n", - " -0.219565\n", - " \n", - " \n", - " -0.678805\n", - " \n", - " \n", - " -1.88927\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " -4\n", - " \n", - " \n", - " -0.961538\n", - " \n", - " \n", - " -0.104011\n", - " \n", - " \n", - " 0.481165\n", - " \n", - " \n", - " -0.850229\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " -5\n", - " \n", - " \n", - " -1.45342\n", - " \n", - " \n", - " -1.05774\n", - " \n", - " \n", - " -0.165562\n", - " \n", - " \n", - " -0.515018\n", - " \n", - "
\n", - " 5\n", - " \n", - " \n", - " -6\n", - " \n", - " \n", - " 1.33694\n", - " \n", - " \n", - " -0.562861\n", - " \n", - " \n", - " -1.39285\n", - " \n", - " \n", - " 0.063328\n", - " \n", - "
\n", - " 6\n", - " \n", - " \n", - " -7\n", - " \n", - " \n", - " -0.121668\n", - " \n", - " \n", - " -1.2076\n", - " \n", - " \n", - " 0.00204021\n", - " \n", - " \n", - " -1.6278\n", - " \n", - "
\n", - " 7\n", - " \n", - " \n", - " -8\n", - " \n", - " \n", - " -0.354493\n", - " \n", - " \n", - " -1.03753\n", - " \n", - " \n", - " 0.385684\n", - " \n", - " \n", - " -0.519818\n", - " \n", - "
\n", - " 8\n", - " \n", - " \n", - " -9\n", - " \n", - " \n", - " -1.68658\n", - " \n", - " \n", - " 1.32596\n", - " \n", - " \n", - " -1.42898\n", - " \n", - " \n", - " 2.08935\n", - " \n", - "
\n", - " 9\n", - " \n", - " \n", - " -10\n", - " \n", - " \n", - " 0.12982\n", - " \n", - " \n", - " -0.631523\n", - " \n", - " \n", - " 0.586538\n", - " \n", - " \n", - " -0.29072\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "style2 = df2.style\n", "style2.use(style1.export())\n", @@ -8898,693 +665,11 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " A\n", - " \n", - " B\n", - " \n", - " C\n", - " \n", - " D\n", - " \n", - " E\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " 1.3\n", - " \n", - " \n", - " nan\n", - " \n", - " \n", - " -0.32\n", - " \n", - " \n", - " -0.99\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " 2\n", - " \n", - " \n", - " -1.1\n", - " \n", - " \n", - " -1.4\n", - " \n", - " \n", - " 0.56\n", - " \n", - " \n", - " 0.3\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " 3\n", - " \n", - " \n", - " -1.6\n", - " \n", - " \n", - " 0.22\n", - " \n", - " \n", - " 0.68\n", - " \n", - " \n", - " 1.9\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " 4\n", - " \n", - " \n", - " 0.96\n", - " \n", - " \n", - " 0.1\n", - " \n", - " \n", - " -0.48\n", - " \n", - " \n", - " 0.85\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " 5\n", - " \n", - " \n", - " 1.5\n", - " \n", - " \n", - " 1.1\n", - " \n", - " \n", - " 0.17\n", - " \n", - " \n", - " 0.52\n", - " \n", - "
\n", - " 5\n", - " \n", - " \n", - " 6\n", - " \n", - " \n", - " -1.3\n", - " \n", - " \n", - " 0.56\n", - " \n", - " \n", - " 1.4\n", - " \n", - " \n", - " -0.063\n", - " \n", - "
\n", - " 6\n", - " \n", - " \n", - " 7\n", - " \n", - " \n", - " 0.12\n", - " \n", - " \n", - " 1.2\n", - " \n", - " \n", - " -0.002\n", - " \n", - " \n", - " 1.6\n", - " \n", - "
\n", - " 7\n", - " \n", - " \n", - " 8\n", - " \n", - " \n", - " 0.35\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " -0.39\n", - " \n", - " \n", - " 0.52\n", - " \n", - "
\n", - " 8\n", - " \n", - " \n", - " 9\n", - " \n", - " \n", - " 1.7\n", - " \n", - " \n", - " -1.3\n", - " \n", - " \n", - " 1.4\n", - " \n", - " \n", - " -2.1\n", - " \n", - "
\n", - " 9\n", - " \n", - " \n", - " 10\n", - " \n", - " \n", - " -0.13\n", - " \n", - " \n", - " 0.63\n", - " \n", - " \n", - " -0.59\n", - " \n", - " \n", - " 0.29\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "with pd.option_context('display.precision', 2):\n", " html = (df.style\n", @@ -9602,693 +687,11 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " A\n", - " \n", - " B\n", - " \n", - " C\n", - " \n", - " D\n", - " \n", - " E\n", - " \n", - "
None\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " 0\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " 1.3\n", - " \n", - " \n", - " nan\n", - " \n", - " \n", - " -0.32\n", - " \n", - " \n", - " -0.99\n", - " \n", - "
\n", - " 1\n", - " \n", - " \n", - " 2\n", - " \n", - " \n", - " -1.1\n", - " \n", - " \n", - " -1.4\n", - " \n", - " \n", - " 0.56\n", - " \n", - " \n", - " 0.3\n", - " \n", - "
\n", - " 2\n", - " \n", - " \n", - " 3\n", - " \n", - " \n", - " -1.6\n", - " \n", - " \n", - " 0.22\n", - " \n", - " \n", - " 0.68\n", - " \n", - " \n", - " 1.9\n", - " \n", - "
\n", - " 3\n", - " \n", - " \n", - " 4\n", - " \n", - " \n", - " 0.96\n", - " \n", - " \n", - " 0.1\n", - " \n", - " \n", - " -0.48\n", - " \n", - " \n", - " 0.85\n", - " \n", - "
\n", - " 4\n", - " \n", - " \n", - " 5\n", - " \n", - " \n", - " 1.5\n", - " \n", - " \n", - " 1.1\n", - " \n", - " \n", - " 0.17\n", - " \n", - " \n", - " 0.52\n", - " \n", - "
\n", - " 5\n", - " \n", - " \n", - " 6\n", - " \n", - " \n", - " -1.3\n", - " \n", - " \n", - " 0.56\n", - " \n", - " \n", - " 1.4\n", - " \n", - " \n", - " -0.063\n", - " \n", - "
\n", - " 6\n", - " \n", - " \n", - " 7\n", - " \n", - " \n", - " 0.12\n", - " \n", - " \n", - " 1.2\n", - " \n", - " \n", - " -0.002\n", - " \n", - " \n", - " 1.6\n", - " \n", - "
\n", - " 7\n", - " \n", - " \n", - " 8\n", - " \n", - " \n", - " 0.35\n", - " \n", - " \n", - " 1\n", - " \n", - " \n", - " -0.39\n", - " \n", - " \n", - " 0.52\n", - " \n", - "
\n", - " 8\n", - " \n", - " \n", - " 9\n", - " \n", - " \n", - " 1.7\n", - " \n", - " \n", - " -1.3\n", - " \n", - " \n", - " 1.4\n", - " \n", - " \n", - " -2.1\n", - " \n", - "
\n", - " 9\n", - " \n", - " \n", - " 10\n", - " \n", - " \n", - " -0.13\n", - " \n", - " \n", - " 0.63\n", - " \n", - " \n", - " -0.59\n", - " \n", - " \n", - " 0.29\n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df.style\\\n", " .applymap(color_negative_red)\\\n", @@ -10319,595 +722,11 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df.style.set_caption('Colormaps, with a caption.')\\\n", " .background_gradient(cmap=cm)" @@ -10931,315 +750,11 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
[... removed notebook output: rendered HTML of the styled DataFrame (caption "Hover to highlight.", columns A-E, 10 rows) ...]
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "from IPython.display import HTML\n", "\n", @@ -11273,6 +788,27 @@ "We hope to collect some useful ones either in pandas, or preferable in a new package that [builds on top](#Extensibility) the tools here." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# CSS Classes\n", + "\n", + "Certain CSS classes are attached to cells.\n", + "\n", + "- Index and Column names include `index_name` and `level` where `k` is its level in a MultiIndex\n", + "- Index label cells include\n", + " + `row_heading`\n", + " + `row` where `n` is the numeric position of the row\n", + " + `level` where `k` is the level in a MultiIndex\n", + "- Column label cells include\n", + " + `col_heading`\n", + " + `col` where `n` is the numeric position of the column\n", + " + `level` where `k` is the level in a MultiIndex\n", + "- Blank cells include `blank`\n", + "- Data cells include `data`" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -11312,592 +848,11 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
[... removed notebook output: rendered HTML of the styled DataFrame shown by the widget example (columns A-E, 10 rows) ...]
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "from IPython.html import widgets\n", "@widgets.interact\n", @@ -11910,7 +865,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": null, "metadata": { "collapsed": false }, @@ -11931,6821 +886,11 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
[... removed notebook output: rendered HTML of the styled DataFrame (caption "Hover to magify", 25 columns x 20 rows) ...]
\n", - " " - ], - "text/plain": [ - "" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "np.random.seed(25)\n", "cmap = cmap=sns.diverging_palette(5, 250, as_cmap=True)\n", @@ -18816,7 +961,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.4.3" + "version": "3.5.1" } }, "nbformat": 4, diff --git a/doc/source/index.rst.template b/doc/source/index.rst.template index 6011c22e9cc2e..1996ad75ea92a 100644 --- a/doc/source/index.rst.template +++ b/doc/source/index.rst.template @@ -92,7 +92,7 @@ Some other notes specialized tool. - pandas is a dependency of `statsmodels - `__, making it an important part of the + `__, making it an important part of the statistical computing ecosystem in Python. - pandas has been used extensively in production in financial applications. diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index 7380c543857a2..0a6691936d97d 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -828,6 +828,8 @@ To select a row where each column meets its own criterion: df[row_mask] +.. _indexing.where_mask: + The :meth:`~pandas.DataFrame.where` Method and Masking ------------------------------------------------------ @@ -891,6 +893,15 @@ without creating a copy: df_orig.where(df > 0, -df, inplace=True); df_orig +.. note:: + + The signature for :func:`DataFrame.where` differs from :func:`numpy.where`. + Roughly ``df1.where(m, df2)`` is equivalent to ``np.where(m, df1, df2)``. + + .. ipython:: python + + df.where(df < 0, -df) == np.where(df < 0, df, -df) + **alignment** Furthermore, ``where`` aligns the input boolean condition (ndarray or DataFrame), diff --git a/doc/source/install.rst b/doc/source/install.rst index 0abaa70586d5a..6295e6f6cbb68 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -165,8 +165,9 @@ To install pandas for Python 3 you may need to use the package ``python3-pandas` Debian & Ubuntu, unstable (latest packages), `NeuroDebian `__ , ``sudo apt-get install python-pandas`` Ubuntu, stable, `official Ubuntu repository `__ , ``sudo apt-get install python-pandas`` Ubuntu, unstable (daily builds), `PythonXY PPA `__; activate by: ``sudo add-apt-repository ppa:pythonxy/pythonxy-devel && sudo apt-get update``, ``sudo apt-get install python-pandas`` - OpenSuse & Fedora, stable, `OpenSuse Repository `__ , ``zypper in python-pandas`` - + OpenSuse, stable, `OpenSuse Repository `__ , ``zypper in python-pandas`` + Fedora, stable, `official Fedora repository `__ , ``dnf install python-pandas`` + Centos/RHEL, stable, `EPEL repository `__ , ``yum install python-pandas`` @@ -252,28 +253,32 @@ Optional Dependencies - `pymysql `__: for MySQL. - `SQLite `__: for SQLite, this is included in Python's standard library by default. -* `matplotlib `__: for plotting -* `openpyxl `__, `xlrd/xlwt `__: Needed for Excel I/O -* `XlsxWriter `__: Alternative Excel writer +* `matplotlib `__: for plotting +* For Excel I/O: + + * `xlrd/xlwt `__: Excel reading (xlrd) and writing (xlwt) + * `openpyxl `__: openpyxl version 1.6.1 + or higher (but lower than 2.0.0), or version 2.2 or higher, for writing .xlsx files (xlrd >= 0.9.0) + * `XlsxWriter `__: Alternative Excel writer + * `Jinja2 `__: Template engine for conditional HTML formatting. -* `boto `__: necessary for Amazon S3 - access. +* `boto `__: necessary for Amazon S3 access. 
* `blosc `__: for msgpack compression using ``blosc`` * One of `PyQt4 `__, `PySide `__, `pygtk `__, `xsel `__, or `xclip - `__: necessary to use - :func:`~pandas.io.clipboard.read_clipboard`. Most package managers on Linux distributions will have ``xclip`` and/or ``xsel`` immediately available for installation. -* Google's `python-gflags `__ , + `__: necessary to use + :func:`~pandas.read_clipboard`. Most package managers on Linux distributions will have ``xclip`` and/or ``xsel`` immediately available for installation. +* Google's `python-gflags <`__ , `oauth2client `__ , `httplib2 `__ and `google-api-python-client `__ : Needed for :mod:`~pandas.io.gbq` * `Backports.lzma `__: Only for Python 2, for writing to and/or reading from an xz compressed DataFrame in CSV; Python 3 support is built into the standard library. * One of the following combinations of libraries is needed to use the - top-level :func:`~pandas.io.html.read_html` function: + top-level :func:`~pandas.read_html` function: * `BeautifulSoup4`_ and `html5lib`_ (Any recent version of `html5lib`_ is okay.) @@ -286,14 +291,14 @@ Optional Dependencies * if you install `BeautifulSoup4`_ you must install either `lxml`_ or `html5lib`_ or both. - :func:`~pandas.io.html.read_html` will **not** work with *only* + :func:`~pandas.read_html` will **not** work with *only* `BeautifulSoup4`_ installed. * You are highly encouraged to read :ref:`HTML reading gotchas `. It explains issues surrounding the installation and usage of the above three libraries * You may need to install an older version of `BeautifulSoup4`_: - - Versions 4.2.1, 4.1.3 and 4.0.2 have been confirmed for 64 and - 32-bit Ubuntu/Debian + Versions 4.2.1, 4.1.3 and 4.0.2 have been confirmed for 64 and 32-bit + Ubuntu/Debian * Additionally, if you're using `Anaconda`_ you should definitely read :ref:`the gotchas about HTML parsing libraries ` diff --git a/doc/source/io.rst b/doc/source/io.rst index 6cf41bbc50fb5..96ec624f4fd3c 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -19,11 +19,10 @@ import matplotlib.pyplot as plt plt.close('all') - from pandas import * - options.display.max_rows=15 import pandas.util.testing as tm - clipdf = DataFrame({'A':[1,2,3],'B':[4,5,6],'C':['p','q','r']}, - index=['x','y','z']) + pd.options.display.max_rows=15 + clipdf = pd.DataFrame({'A':[1,2,3],'B':[4,5,6],'C':['p','q','r']}, + index=['x','y','z']) =============================== IO Tools (Text, CSV, HDF5, ...) @@ -134,6 +133,14 @@ usecols : array-like, default ``None`` inferred from the document header row(s). For example, a valid `usecols` parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Using this parameter results in much faster parsing time and lower memory usage. +as_recarray : boolean, default ``False`` + DEPRECATED: this argument will be removed in a future version. Please call + ``pd.read_csv(...).to_records()`` instead. + + Return a NumPy recarray instead of a DataFrame after parsing the data. If + set to ``True``, this option takes precedence over the ``squeeze`` parameter. + In addition, as row indices are not available in such a format, the ``index_col`` + parameter will be ignored. squeeze : boolean, default ``False`` If the parsed data only contains one column then return a Series. prefix : str, default ``None`` @@ -167,6 +174,8 @@ skiprows : list-like or integer, default ``None`` of the file. skipfooter : int, default ``0`` Number of lines at bottom of file to skip (unsupported with engine='c'). 
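For reference, the replacement suggested above for ``as_recarray`` looks like the following minimal sketch (``'data.csv'`` is a placeholder file name, not part of the patch):

.. code-block:: python

   import pandas as pd

   # instead of pd.read_csv('data.csv', as_recarray=True), parse to a
   # DataFrame as usual and convert the result afterwards
   rec = pd.read_csv('data.csv').to_records(index=False)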
+skip_footer : int, default ``0`` + DEPRECATED: use the ``skipfooter`` parameter instead, as they are identical nrows : int, default ``None`` Number of rows of file to read. Useful for reading pieces of large files. low_memory : boolean, default ``True`` @@ -176,11 +185,29 @@ low_memory : boolean, default ``True`` Note that the entire file is read into a single DataFrame regardless, use the ``chunksize`` or ``iterator`` parameter to return the data in chunks. (Only valid with C parser) +buffer_lines : int, default None + DEPRECATED: this argument will be removed in a future version because its + value is not respected by the parser +compact_ints : boolean, default False + DEPRECATED: this argument will be removed in a future version + + If ``compact_ints`` is ``True``, then for any column that is of integer dtype, the + parser will attempt to cast it as the smallest integer ``dtype`` possible, either + signed or unsigned depending on the specification from the ``use_unsigned`` parameter. +use_unsigned : boolean, default False + DEPRECATED: this argument will be removed in a future version + + If integer columns are being compacted (i.e. ``compact_ints=True``), specify whether + the column should be compacted to the smallest signed or unsigned integer dtype. +memory_map : boolean, default False + If a filepath is provided for ``filepath_or_buffer``, map the file object + directly onto memory and access the data directly from there. Using this + option can improve performance because there is no longer any I/O overhead. NA and Missing Data Handling ++++++++++++++++++++++++++++ -na_values : str, list-like or dict, default ``None`` +na_values : scalar, str, list-like, or dict, default ``None`` Additional strings to recognize as NA/NaN. If dict passed, specific per-column NA values. By default the following values are interpreted as NaN: ``'-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', 'NA', @@ -252,16 +279,23 @@ thousands : str, default ``None`` Thousands separator. decimal : str, default ``'.'`` Character to recognize as decimal point. E.g. use ``','`` for European data. +float_precision : string, default None + Specifies which converter the C engine should use for floating-point values. + The options are ``None`` for the ordinary converter, ``high`` for the + high-precision converter, and ``round_trip`` for the round-trip converter. lineterminator : str (length 1), default ``None`` Character to break file into lines. Only valid with C parser. quotechar : str (length 1) The character used to denote the start and end of a quoted item. Quoted items can include the delimiter and it will be ignored. -quoting : int or ``csv.QUOTE_*`` instance, default ``None`` +quoting : int or ``csv.QUOTE_*`` instance, default ``0`` Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of ``QUOTE_MINIMAL`` (0), ``QUOTE_ALL`` (1), ``QUOTE_NONNUMERIC`` (2) or - ``QUOTE_NONE`` (3). Default (``None``) results in ``QUOTE_MINIMAL`` - behavior. + ``QUOTE_NONE`` (3). +doublequote : boolean, default ``True`` + When ``quotechar`` is specified and ``quoting`` is not ``QUOTE_NONE``, + indicate whether or not to interpret two consecutive ``quotechar`` elements + **inside** a field as a single ``quotechar`` element. escapechar : str (length 1), default ``None`` One-character string used to escape delimiter when quoting is ``QUOTE_NONE``. 
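As a small illustration of the ``quoting``/``escapechar`` interaction described above, here is a sketch with made-up inline data (using Python 3's ``io.StringIO``):

.. code-block:: python

   import csv
   from io import StringIO
   import pandas as pd

   data = 'a,b\nhello\\, world,5\n'

   # with QUOTE_NONE, the escape character lets the delimiter appear
   # inside a field: column 'a' parses as "hello, world"
   pd.read_csv(StringIO(data), quoting=csv.QUOTE_NONE, escapechar='\\')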
comment : str, default ``None`` @@ -402,11 +436,108 @@ individual columns: df = pd.read_csv(StringIO(data), dtype={'b': object, 'c': np.float64}) df.dtypes +Fortunately, ``pandas`` offers more than one way to ensure that your column(s) +contain only one ``dtype``. If you're unfamiliar with these concepts, you can +see :ref:`here` to learn more about dtypes, and +:ref:`here` to learn more about ``object`` conversion in +``pandas``. + + +For instance, you can use the ``converters`` argument +of :func:`~pandas.read_csv`: + +.. ipython:: python + + data = "col_1\n1\n2\n'A'\n4.22" + df = pd.read_csv(StringIO(data), converters={'col_1':str}) + df + df['col_1'].apply(type).value_counts() + +Or you can use the :func:`~pandas.to_numeric` function to coerce the +dtypes after reading in the data, + +.. ipython:: python + + df2 = pd.read_csv(StringIO(data)) + df2['col_1'] = pd.to_numeric(df2['col_1'], errors='coerce') + df2 + df2['col_1'].apply(type).value_counts() + +which would convert all valid parsing to floats, leaving the invalid parsing +as ``NaN``. + +Ultimately, how you deal with reading in columns containing mixed dtypes +depends on your specific needs. In the case above, if you wanted to ``NaN`` out +the data anomalies, then :func:`~pandas.to_numeric` is probably your best option. +However, if you wanted for all the data to be coerced, no matter the type, then +using the ``converters`` argument of :func:`~pandas.read_csv` would certainly be +worth trying. + .. note:: The ``dtype`` option is currently only supported by the C engine. Specifying ``dtype`` with ``engine`` other than 'c' raises a ``ValueError``. +.. note:: + In some cases, reading in abnormal data with columns containing mixed dtypes + will result in an inconsistent dataset. If you rely on pandas to infer the + dtypes of your columns, the parsing engine will go and infer the dtypes for + different chunks of the data, rather than the whole dataset at once. Consequently, + you can end up with column(s) with mixed dtypes. For example, + + .. ipython:: python + :okwarning: + + df = pd.DataFrame({'col_1':range(500000) + ['a', 'b'] + range(500000)}) + df.to_csv('foo') + mixed_df = pd.read_csv('foo') + mixed_df['col_1'].apply(type).value_counts() + mixed_df['col_1'].dtype + + will result with `mixed_df` containing an ``int`` dtype for certain chunks + of the column, and ``str`` for others due to the mixed dtypes from the + data that was read in. It is important to note that the overall column will be + marked with a ``dtype`` of ``object``, which is used for columns with mixed dtypes. + +.. _io.categorical: + +Specifying Categorical dtype +'''''''''''''''''''''''''''' + +.. versionadded:: 0.19.0 + +``Categorical`` columns can be parsed directly by specifying ``dtype='category'`` + +.. ipython:: python + + data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3' + + pd.read_csv(StringIO(data)) + pd.read_csv(StringIO(data)).dtypes + pd.read_csv(StringIO(data), dtype='category').dtypes + +Individual columns can be parsed as a ``Categorical`` using a dict specification + +.. ipython:: python + + pd.read_csv(StringIO(data), dtype={'col1': 'category'}).dtypes + +.. note:: + + The resulting categories will always be parsed as strings (object dtype). + If the categories are numeric they can be converted using the + :func:`to_numeric` function, or as appropriate, another converter + such as :func:`to_datetime`. + + .. 
ipython:: python + + df = pd.read_csv(StringIO(data), dtype='category') + df.dtypes + df['col3'] + df['col3'].cat.categories = pd.to_numeric(df['col3'].cat.categories) + df['col3'] + + Naming and Using Columns '''''''''''''''''''''''' @@ -1258,7 +1389,7 @@ class of the csv module. For this, you have to specify ``sep=None``. .. ipython:: python :suppress: - df = DataFrame(np.random.randn(10, 4)) + df = pd.DataFrame(np.random.randn(10, 4)) df.to_csv('tmp.sv', sep='|') df.to_csv('tmp2.sv', sep=':') @@ -1318,7 +1449,7 @@ back to python if C-unsupported options are specified. Currently, C-unsupported options include: - ``sep`` other than a single character (e.g. regex separators) -- ``skip_footer`` +- ``skipfooter`` - ``sep=None`` with ``delim_whitespace=False`` Specifying any of the above options will produce a ``ParserWarning`` unless the @@ -1433,12 +1564,13 @@ with optional parameters: - ``force_ascii`` : force encoded string to be ASCII, default True. - ``date_unit`` : The time unit to encode to, governs timestamp and ISO8601 precision. One of 's', 'ms', 'us' or 'ns' for seconds, milliseconds, microseconds and nanoseconds respectively. Default 'ms'. - ``default_handler`` : The handler to call if an object cannot otherwise be converted to a suitable format for JSON. Takes a single argument, which is the object to convert, and returns a serializable object. +- ``lines`` : If ``records`` orient, then will write each record per line as json. Note ``NaN``'s, ``NaT``'s and ``None`` will be converted to ``null`` and ``datetime`` objects will be converted based on the ``date_format`` and ``date_unit`` parameters. .. ipython:: python - dfj = DataFrame(randn(5, 2), columns=list('AB')) + dfj = pd.DataFrame(randn(5, 2), columns=list('AB')) json = dfj.to_json() json @@ -1450,10 +1582,10 @@ file / string. Consider the following DataFrame and Series: .. ipython:: python - dfjo = DataFrame(dict(A=range(1, 4), B=range(4, 7), C=range(7, 10)), - columns=list('ABC'), index=list('xyz')) + dfjo = pd.DataFrame(dict(A=range(1, 4), B=range(4, 7), C=range(7, 10)), + columns=list('ABC'), index=list('xyz')) dfjo - sjo = Series(dict(x=15, y=16, z=17), name='D') + sjo = pd.Series(dict(x=15, y=16, z=17), name='D') sjo **Column oriented** (the default for ``DataFrame``) serializes the data as @@ -1510,8 +1642,8 @@ Writing in ISO date format .. ipython:: python - dfd = DataFrame(randn(5, 2), columns=list('AB')) - dfd['date'] = Timestamp('20130101') + dfd = pd.DataFrame(randn(5, 2), columns=list('AB')) + dfd['date'] = pd.Timestamp('20130101') dfd = dfd.sort_index(1, ascending=False) json = dfd.to_json(date_format='iso') json @@ -1535,10 +1667,10 @@ Writing to a file, with a date index and a date column .. ipython:: python dfj2 = dfj.copy() - dfj2['date'] = Timestamp('20130101') + dfj2['date'] = pd.Timestamp('20130101') dfj2['ints'] = list(range(5)) dfj2['bools'] = True - dfj2.index = date_range('20130101', periods=5) + dfj2.index = pd.date_range('20130101', periods=5) dfj2.to_json('test.json') open('test.json').read() @@ -1574,7 +1706,7 @@ can be dealt with by specifying a simple ``default_handler``: .. ipython:: python - DataFrame([1.0, 2.0, complex(1.0, 2.0)]).to_json(default_handler=str) + pd.DataFrame([1.0, 2.0, complex(1.0, 2.0)]).to_json(default_handler=str) .. _io.json_reader: @@ -1623,6 +1755,8 @@ is ``None``. To explicitly force ``Series`` parsing, pass ``typ=series`` None. 
By default the timestamp precision will be detected, if this is not desired then pass one of 's', 'ms', 'us' or 'ns' to force timestamp precision to seconds, milliseconds, microseconds or nanoseconds respectively. +- ``lines`` : reads file as one json object per line. +- ``encoding`` : The encoding to use to decode py3 bytes. The parser will raise one of ``ValueError/TypeError/AssertionError`` if the JSON is not parseable. @@ -1685,7 +1819,7 @@ Preserve string indices: .. ipython:: python - si = DataFrame(np.zeros((4, 4)), + si = pd.DataFrame(np.zeros((4, 4)), columns=list(range(4)), index=[str(i) for i in range(4)]) si @@ -1733,17 +1867,17 @@ data: randfloats = np.random.uniform(-100, 1000, 10000) randfloats.shape = (1000, 10) - dffloats = DataFrame(randfloats, columns=list('ABCDEFGHIJ')) + dffloats = pd.DataFrame(randfloats, columns=list('ABCDEFGHIJ')) jsonfloats = dffloats.to_json() .. ipython:: python - timeit read_json(jsonfloats) + timeit pd.read_json(jsonfloats) .. ipython:: python - timeit read_json(jsonfloats, numpy=True) + timeit pd.read_json(jsonfloats, numpy=True) The speedup is less noticeable for smaller datasets: @@ -1753,11 +1887,11 @@ The speedup is less noticeable for smaller datasets: .. ipython:: python - timeit read_json(jsonfloats) + timeit pd.read_json(jsonfloats) .. ipython:: python - timeit read_json(jsonfloats, numpy=True) + timeit pd.read_json(jsonfloats, numpy=True) .. warning:: @@ -1812,6 +1946,26 @@ into a flat table. json_normalize(data, 'counties', ['state', 'shortname', ['info', 'governor']]) +.. _io.jsonl: + +Line delimited json +''''''''''''''''''' + +.. versionadded:: 0.19.0 + +pandas is able to read and write line-delimited json files that are common in data processing pipelines +using Hadoop or Spark. + +.. ipython:: python + + jsonl = ''' + {"a":1,"b":2} + {"a":3,"b":4} + ''' + df = pd.read_json(jsonl, lines=True) + df + df.to_json(orient='records', lines=True) + HTML ---- @@ -1842,7 +1996,7 @@ Read a URL with no options .. ipython:: python url = 'http://www.fdic.gov/bank/individual/failed/banklist.html' - dfs = read_html(url) + dfs = pd.read_html(url) dfs .. note:: @@ -1862,7 +2016,7 @@ as a string .. ipython:: python with open(file_path, 'r') as f: - dfs = read_html(f.read()) + dfs = pd.read_html(f.read()) dfs You can even pass in an instance of ``StringIO`` if you so desire @@ -1872,7 +2026,7 @@ You can even pass in an instance of ``StringIO`` if you so desire with open(file_path, 'r') as f: sio = StringIO(f.read()) - dfs = read_html(sio) + dfs = pd.read_html(sio) dfs .. note:: @@ -1889,7 +2043,7 @@ Read a URL and match a table that contains specific text .. code-block:: python match = 'Metcalf Bank' - df_list = read_html(url, match=match) + df_list = pd.read_html(url, match=match) Specify a header row (by default ```` elements are used to form the column index); if specified, the header row is taken from the data minus the parsed @@ -1897,48 +2051,77 @@ header elements (```` elements). .. code-block:: python - dfs = read_html(url, header=0) + dfs = pd.read_html(url, header=0) Specify an index column .. code-block:: python - dfs = read_html(url, index_col=0) + dfs = pd.read_html(url, index_col=0) Specify a number of rows to skip .. code-block:: python - dfs = read_html(url, skiprows=0) + dfs = pd.read_html(url, skiprows=0) Specify a number of rows to skip using a list (``xrange`` (Python 2 only) works as well) .. code-block:: python - dfs = read_html(url, skiprows=range(2)) + dfs = pd.read_html(url, skiprows=range(2)) Specify an HTML attribute .. 
code-block:: python - dfs1 = read_html(url, attrs={'id': 'table'}) - dfs2 = read_html(url, attrs={'class': 'sortable'}) + dfs1 = pd.read_html(url, attrs={'id': 'table'}) + dfs2 = pd.read_html(url, attrs={'class': 'sortable'}) print(np.array_equal(dfs1[0], dfs2[0])) # Should be True +Specify values that should be converted to NaN + +.. code-block:: python + + dfs = pd.read_html(url, na_values=['No Acquirer']) + +.. versionadded:: 0.19 + +Specify whether to keep the default set of NaN values + +.. code-block:: python + + dfs = pd.read_html(url, keep_default_na=False) + +.. versionadded:: 0.19 + +Specify converters for columns. This is useful for numerical text data that has +leading zeros. By default columns that are numerical are cast to numeric +types and the leading zeros are lost. To avoid this, we can convert these +columns to strings. + +.. code-block:: python + + url_mcc = 'https://en.wikipedia.org/wiki/Mobile_country_code' + dfs = pd.read_html(url_mcc, match='Telekom Albania', header=0, converters={'MNC': + str}) + +.. versionadded:: 0.19 + Use some combination of the above .. code-block:: python - dfs = read_html(url, match='Metcalf Bank', index_col=0) + dfs = pd.read_html(url, match='Metcalf Bank', index_col=0) Read in pandas ``to_html`` output (with some loss of floating point precision) .. code-block:: python - df = DataFrame(randn(2, 2)) + df = pd.DataFrame(randn(2, 2)) s = df.to_html(float_format='{0:.40g}'.format) - dfin = read_html(s, index_col=0) + dfin = pd.read_html(s, index_col=0) The ``lxml`` backend will raise an error on a failed parse if that is the only parser you provide (if you only have a single parser you can provide just a @@ -1947,13 +2130,13 @@ for example, the function expects a sequence of strings) .. code-block:: python - dfs = read_html(url, 'Metcalf Bank', index_col=0, flavor=['lxml']) + dfs = pd.read_html(url, 'Metcalf Bank', index_col=0, flavor=['lxml']) or .. code-block:: python - dfs = read_html(url, 'Metcalf Bank', index_col=0, flavor='lxml') + dfs = pd.read_html(url, 'Metcalf Bank', index_col=0, flavor='lxml') However, if you have bs4 and html5lib installed and pass ``None`` or ``['lxml', 'bs4']`` then the parse will most likely succeed. Note that *as soon as a parse @@ -1961,7 +2144,7 @@ succeeds, the function will return*. .. code-block:: python - dfs = read_html(url, 'Metcalf Bank', index_col=0, flavor=['lxml', 'bs4']) + dfs = pd.read_html(url, 'Metcalf Bank', index_col=0, flavor=['lxml', 'bs4']) .. _io.html: @@ -1989,7 +2172,7 @@ in the method ``to_string`` described above. .. ipython:: python - df = DataFrame(randn(2, 2)) + df = pd.DataFrame(randn(2, 2)) df print(df.to_html()) # raw html @@ -2065,7 +2248,7 @@ Finally, the ``escape`` argument allows you to control whether the .. ipython:: python - df = DataFrame({'a': list('&<>'), 'b': randn(3)}) + df = pd.DataFrame({'a': list('&<>'), 'b': randn(3)}) .. ipython:: python @@ -2517,7 +2700,7 @@ DataFrame into clipboard and reading it back. .. ipython:: python - df=pd.DataFrame(randn(5,3)) + df = pd.DataFrame(randn(5,3)) df df.to_clipboard() pd.read_clipboard() @@ -2547,7 +2730,7 @@ any pickled pandas object (or any other pickled object) from file: .. ipython:: python - read_pickle('foo.pkl') + pd.read_pickle('foo.pkl') .. ipython:: python :suppress: @@ -2611,10 +2794,10 @@ both on the writing (serialization), and reading (deserialization). .. 
ipython:: python - df = DataFrame(np.random.rand(5,2),columns=list('AB')) + df = pd.DataFrame(np.random.rand(5,2),columns=list('AB')) df.to_msgpack('foo.msg') pd.read_msgpack('foo.msg') - s = Series(np.random.rand(5),index=date_range('20130101',periods=5)) + s = pd.Series(np.random.rand(5),index=pd.date_range('20130101',periods=5)) You can pass a list of objects and you will receive them back on deserialization. @@ -2699,7 +2882,7 @@ for some advanced strategies .. ipython:: python - store = HDFStore('store.h5') + store = pd.HDFStore('store.h5') print(store) Objects can be written to the file just like adding key-value pairs to a @@ -2708,13 +2891,13 @@ dict: .. ipython:: python np.random.seed(1234) - index = date_range('1/1/2000', periods=8) - s = Series(randn(5), index=['a', 'b', 'c', 'd', 'e']) - df = DataFrame(randn(8, 3), index=index, - columns=['A', 'B', 'C']) - wp = Panel(randn(2, 5, 4), items=['Item1', 'Item2'], - major_axis=date_range('1/1/2000', periods=5), - minor_axis=['A', 'B', 'C', 'D']) + index = pd.date_range('1/1/2000', periods=8) + s = pd.Series(randn(5), index=['a', 'b', 'c', 'd', 'e']) + df = pd.DataFrame(randn(8, 3), index=index, + columns=['A', 'B', 'C']) + wp = pd.Panel(randn(2, 5, 4), items=['Item1', 'Item2'], + major_axis=pd.date_range('1/1/2000', periods=5), + minor_axis=['A', 'B', 'C', 'D']) # store.put('s', s) is an equivalent method store['s'] = s @@ -2757,7 +2940,7 @@ Closing a Store, Context Manager # Working with, and automatically closing the store with the context # manager - with HDFStore('store.h5') as store: + with pd.HDFStore('store.h5') as store: store.keys() .. ipython:: python @@ -2777,9 +2960,9 @@ similar to how ``read_csv`` and ``to_csv`` work. (new in 0.11.0) .. ipython:: python - df_tl = DataFrame(dict(A=list(range(5)), B=list(range(5)))) + df_tl = pd.DataFrame(dict(A=list(range(5)), B=list(range(5)))) df_tl.to_hdf('store_tl.h5','table',append=True) - read_hdf('store_tl.h5', 'table', where = ['index>2']) + pd.read_hdf('store_tl.h5', 'table', where = ['index>2']) .. ipython:: python :suppress: @@ -2824,7 +3007,7 @@ This is also true for the major axis of a ``Panel``: [[np.nan, np.nan, np.nan], [np.nan,5,6]], [[np.nan, np.nan, np.nan],[np.nan,3,np.nan]]] - panel_with_major_axis_all_missing = Panel(matrix, + panel_with_major_axis_all_missing = pd.Panel(matrix, items=['Item1', 'Item2','Item3'], major_axis=[1,2], minor_axis=['A', 'B', 'C']) @@ -2835,7 +3018,7 @@ This is also true for the major axis of a ``Panel``: dropna = True, format='table', mode='w') - reloaded = read_hdf('file.h5', 'panel') + reloaded = pd.read_hdf('file.h5', 'panel') reloaded @@ -2868,7 +3051,7 @@ This format is specified by default when using ``put`` or ``to_hdf`` or by ``for .. code-block:: python - DataFrame(randn(10,2)).to_hdf('test_fixed.h5','df') + pd.DataFrame(randn(10,2)).to_hdf('test_fixed.h5','df') pd.read_hdf('test_fixed.h5','df',where='index>5') TypeError: cannot pass a where specification when reading a fixed format. @@ -2900,7 +3083,7 @@ enable ``put/append/to_hdf`` to by default store in the ``table`` format. .. ipython:: python - store = HDFStore('store.h5') + store = pd.HDFStore('store.h5') df1 = df[0:4] df2 = df[4:] @@ -2988,14 +3171,14 @@ defaults to `nan`. .. 
ipython:: python - df_mixed = DataFrame({ 'A' : randn(8), - 'B' : randn(8), - 'C' : np.array(randn(8),dtype='float32'), - 'string' :'string', - 'int' : 1, - 'bool' : True, - 'datetime64' : Timestamp('20010102')}, - index=list(range(8))) + df_mixed = pd.DataFrame({ 'A' : randn(8), + 'B' : randn(8), + 'C' : np.array(randn(8),dtype='float32'), + 'string' :'string', + 'int' : 1, + 'bool' : True, + 'datetime64' : pd.Timestamp('20010102')}, + index=list(range(8))) df_mixed.ix[3:5,['A', 'B', 'string', 'datetime64']] = np.nan store.append('df_mixed', df_mixed, min_itemsize = {'values': 50}) @@ -3014,13 +3197,13 @@ storing/selecting from homogeneous index DataFrames. .. ipython:: python - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], - ['one', 'two', 'three']], - labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['foo', 'bar']) - df_mi = DataFrame(np.random.randn(10, 3), index=index, - columns=['A', 'B', 'C']) + index = pd.MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['foo', 'bar']) + df_mi = pd.DataFrame(np.random.randn(10, 3), index=index, + columns=['A', 'B', 'C']) df_mi store.append('df_mi',df_mi) @@ -3135,14 +3318,14 @@ Here are some examples: .. ipython:: python - dfq = DataFrame(randn(10,4),columns=list('ABCD'),index=date_range('20130101',periods=10)) + dfq = pd.DataFrame(randn(10,4),columns=list('ABCD'),index=pd.date_range('20130101',periods=10)) store.append('dfq',dfq,format='table',data_columns=True) Use boolean expressions, with in-line function evaluation. .. ipython:: python - store.select('dfq',"index>Timestamp('20130104') & columns=['A', 'B']") + store.select('dfq',"index>pd.Timestamp('20130104') & columns=['A', 'B']") Use and inline column reference @@ -3156,7 +3339,7 @@ Works with a Panel as well. store.append('wp',wp) store - store.select('wp', "major_axis>Timestamp('20000102') & minor_axis=['A', 'B']") + store.select('wp', "major_axis>pd.Timestamp('20000102') & minor_axis=['A', 'B']") The ``columns`` keyword can be supplied to select a list of columns to be returned, this is equivalent to passing a @@ -3201,7 +3384,7 @@ specified in the format: ``()``, where float may be signed (and fra .. ipython:: python from datetime import timedelta - dftd = DataFrame(dict(A = Timestamp('20130101'), B = [ Timestamp('20130101') + timedelta(days=i,seconds=10) for i in range(10) ])) + dftd = pd.DataFrame(dict(A = pd.Timestamp('20130101'), B = [ pd.Timestamp('20130101') + timedelta(days=i,seconds=10) for i in range(10) ])) dftd['C'] = dftd['A']-dftd['B'] dftd store.append('dftd',dftd,data_columns=True) @@ -3237,8 +3420,8 @@ Oftentimes when appending large amounts of data to a store, it is useful to turn .. ipython:: python - df_1 = DataFrame(randn(10,2),columns=list('AB')) - df_2 = DataFrame(randn(10,2),columns=list('AB')) + df_1 = pd.DataFrame(randn(10,2),columns=list('AB')) + df_2 = pd.DataFrame(randn(10,2),columns=list('AB')) st = pd.HDFStore('appends.h5',mode='w') st.append('df', df_1, data_columns=['B'], index=False) @@ -3284,7 +3467,7 @@ be data_columns # on-disk operations store.append('df_dc', df_dc, data_columns = ['B', 'C', 'string', 'string2']) - store.select('df_dc', [ Term('B>0') ]) + store.select('df_dc', [ pd.Term('B>0') ]) # getting creative store.select('df_dc', 'B > 0 & C > 0 & string == foo') @@ -3323,7 +3506,7 @@ The default is 50,000 rows returned in a chunk. .. 
code-block:: python - for df in read_hdf('store.h5','df', chunksize=3): + for df in pd.read_hdf('store.h5','df', chunksize=3): print(df) Note, that the chunksize keyword applies to the **source** rows. So if you @@ -3335,7 +3518,7 @@ chunks. .. ipython:: python - dfeq = DataFrame({'number': np.arange(1,11)}) + dfeq = pd.DataFrame({'number': np.arange(1,11)}) dfeq store.append('dfeq', dfeq, data_columns=['number']) @@ -3375,7 +3558,7 @@ Sometimes you want to get the coordinates (a.k.a the index locations) of your qu .. ipython:: python - df_coord = DataFrame(np.random.randn(1000,2),index=date_range('20000101',periods=1000)) + df_coord = pd.DataFrame(np.random.randn(1000,2),index=pd.date_range('20000101',periods=1000)) store.append('df_coord',df_coord) c = store.select_as_coordinates('df_coord','index>20020101') c.summary() @@ -3392,10 +3575,10 @@ a datetimeindex which are 5. .. ipython:: python - df_mask = DataFrame(np.random.randn(1000,2),index=date_range('20000101',periods=1000)) + df_mask = pd.DataFrame(np.random.randn(1000,2),index=pd.date_range('20000101',periods=1000)) store.append('df_mask',df_mask) c = store.select_column('df_mask','index') - where = c[DatetimeIndex(c).month==5].index + where = c[pd.DatetimeIndex(c).month==5].index store.select('df_mask',where=where) Storer Object @@ -3440,8 +3623,8 @@ results. .. ipython:: python - df_mt = DataFrame(randn(8, 6), index=date_range('1/1/2000', periods=8), - columns=['A', 'B', 'C', 'D', 'E', 'F']) + df_mt = pd.DataFrame(randn(8, 6), index=pd.date_range('1/1/2000', periods=8), + columns=['A', 'B', 'C', 'D', 'E', 'F']) df_mt['foo'] = 'bar' df_mt.ix[1, ('A', 'B')] = np.nan @@ -3532,7 +3715,7 @@ Compression for all objects within the file .. code-block:: python - store_compressed = HDFStore('store_compressed.h5', complevel=9, complib='blosc') + store_compressed = pd.HDFStore('store_compressed.h5', complevel=9, complib='blosc') Or on-the-fly compression (this only applies to tables). You can turn off file compression for a specific table by passing ``complevel=0`` @@ -3630,8 +3813,8 @@ stored in a more efficient manner. .. ipython:: python - dfcat = DataFrame({ 'A' : Series(list('aabbcdba')).astype('category'), - 'B' : np.random.randn(8) }) + dfcat = pd.DataFrame({ 'A' : pd.Series(list('aabbcdba')).astype('category'), + 'B' : np.random.randn(8) }) dfcat dfcat.dtypes cstore = pd.HDFStore('cats.h5', mode='w') @@ -3688,7 +3871,7 @@ Starting in 0.11.0, passing a ``min_itemsize`` dict will cause all passed column .. ipython:: python - dfs = DataFrame(dict(A = 'foo', B = 'bar'),index=list(range(5))) + dfs = pd.DataFrame(dict(A = 'foo', B = 'bar'),index=list(range(5))) dfs # A and B have a size of 30 @@ -3707,7 +3890,7 @@ You could inadvertently turn an actual ``nan`` value into a missing value. .. ipython:: python - dfss = DataFrame(dict(A = ['foo','bar','nan'])) + dfss = pd.DataFrame(dict(A = ['foo','bar','nan'])) dfss store.append('dfss', dfss) @@ -3741,7 +3924,7 @@ It is possible to write an ``HDFStore`` object that can easily be imported into index=range(100)) df_for_r.head() - store_export = HDFStore('export.h5') + store_export = pd.HDFStore('export.h5') store_export.append('df_for_r', df_for_r, data_columns=df_dc.columns) store_export @@ -3828,9 +4011,10 @@ number of options, please see the docstring. legacy_file_path = os.path.abspath('source/_static/legacy_0.10.h5') .. 
ipython:: python + :okwarning: # a legacy store - legacy_store = HDFStore(legacy_file_path,'r') + legacy_store = pd.HDFStore(legacy_file_path,'r') legacy_store # copy (and return the new handle) @@ -3875,8 +4059,9 @@ Experimental HDFStore supports ``Panel4D`` storage. .. ipython:: python + :okwarning: - p4d = Panel4D({ 'l1' : wp }) + p4d = pd.Panel4D({ 'l1' : wp }) p4d store.append('p4d', p4d) store @@ -3889,10 +4074,11 @@ store your data. Pass the ``axes`` keyword with a list of dimensions object). This cannot be changed after table creation. .. ipython:: python + :okwarning: store.append('p4d2', p4d, axes=['labels', 'major_axis', 'minor_axis']) store - store.select('p4d2', [ Term('labels=l1'), Term('items=Item1'), Term('minor_axis=A_big_strings') ]) + store.select('p4d2', [ pd.Term('labels=l1'), pd.Term('items=Item1'), pd.Term('minor_axis=A_big_strings') ]) .. ipython:: python :suppress: @@ -3994,7 +4180,7 @@ the database using :func:`~pandas.DataFrame.to_sql`. (42, datetime.datetime(2010,10,19), 'Y', -12.5, False), (63, datetime.datetime(2010,10,20), 'Z', 5.73, True)] - data = DataFrame(d, columns=c) + data = pd.DataFrame(d, columns=c) .. ipython:: python @@ -4244,7 +4430,7 @@ to existing tables. You will need to install some additional dependencies: -- Google's `python-gflags `__ +- Google's `python-gflags `__ - `httplib2 `__ - `google-api-python-client `__ @@ -4291,6 +4477,15 @@ Additional information on service accounts can be found You will need to install an additional dependency: `oauth2client `__. +Authentication via ``application default credentials`` is also possible. This is only valid +if the parameter ``private_key`` is not provided. This method also requires that +the credentials can be fetched from the environment the code is running in. +Otherwise, the OAuth2 client-side authentication is used. +Additional information on +`application default credentials `__. + +.. versionadded:: 0.19.0 + .. note:: The `'private_key'` parameter can be set to either the file path of the service account key @@ -4335,6 +4530,13 @@ destination DataFrame as well as a preferred column order as follows: You can toggle the verbose output via the ``verbose`` flag which defaults to ``True``. +.. note:: + + The ``dialect`` argument can be used to indicate whether to use BigQuery's ``'legacy'`` SQL + or BigQuery's ``'standard'`` SQL (beta). The default value is ``'legacy'``. For more information + on BigQuery's standard SQL, see `BigQuery SQL Reference + `__ + .. _io.bigquery_writer: @@ -4474,7 +4676,7 @@ into a .dta file. The format version of this file is always 115 (Stata 12). .. ipython:: python - df = DataFrame(randn(10, 2), columns=list('AB')) + df = pd.DataFrame(randn(10, 2), columns=list('AB')) df.to_stata('stata.dta') *Stata* data files have limited data type support; only strings with @@ -4699,7 +4901,7 @@ This is an informal comparison of various IO methods, using pandas 0.13.1. .. code-block:: ipython - In [1]: df = DataFrame(randn(1000000,2),columns=list('AB')) + In [1]: df = pd.DataFrame(randn(1000000,2),columns=list('AB')) In [2]: df.info() @@ -4756,7 +4958,7 @@ Reading Space on disk (in bytes) -.. code-block:: +.. 
code-block:: none 25843712 Apr 8 14:11 test.sql 24007368 Apr 8 14:11 test_fixed.hdf @@ -4773,7 +4975,7 @@ And here's the code import os from pandas.io import sql - df = DataFrame(randn(1000000,2),columns=list('AB')) + df = pd.DataFrame(randn(1000000,2),columns=list('AB')) def test_sql_write(df): if os.path.exists('test.sql'): diff --git a/doc/source/merging.rst b/doc/source/merging.rst index ba675d9aac830..c6541a26c72b4 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -78,34 +78,35 @@ some configurable handling of "what to do with the other axes": :: pd.concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, - keys=None, levels=None, names=None, verify_integrity=False) + keys=None, levels=None, names=None, verify_integrity=False, + copy=True) -- ``objs``: a sequence or mapping of Series, DataFrame, or Panel objects. If a +- ``objs`` : a sequence or mapping of Series, DataFrame, or Panel objects. If a dict is passed, the sorted keys will be used as the `keys` argument, unless it is passed, in which case the values will be selected (see below). Any None objects will be dropped silently unless they are all None in which case a ValueError will be raised. -- ``axis``: {0, 1, ...}, default 0. The axis to concatenate along. -- ``join``: {'inner', 'outer'}, default 'outer'. How to handle indexes on +- ``axis`` : {0, 1, ...}, default 0. The axis to concatenate along. +- ``join`` : {'inner', 'outer'}, default 'outer'. How to handle indexes on other axis(es). Outer for union and inner for intersection. -- ``join_axes``: list of Index objects. Specific indexes to use for the other +- ``ignore_index`` : boolean, default False. If True, do not use the index + values on the concatenation axis. The resulting axis will be labeled 0, ..., + n - 1. This is useful if you are concatenating objects where the + concatenation axis does not have meaningful indexing information. Note + the index values on the other axes are still respected in the join. +- ``join_axes`` : list of Index objects. Specific indexes to use for the other n - 1 axes instead of performing inner/outer set logic. -- ``keys``: sequence, default None. Construct hierarchical index using the +- ``keys`` : sequence, default None. Construct hierarchical index using the passed keys as the outermost level. If multiple levels passed, should contain tuples. - ``levels`` : list of sequences, default None. Specific levels (unique values) to use for constructing a MultiIndex. Otherwise they will be inferred from the keys. -- ``names``: list, default None. Names for the levels in the resulting +- ``names`` : list, default None. Names for the levels in the resulting hierarchical index. -- ``verify_integrity``: boolean, default False. Check whether the new +- ``verify_integrity`` : boolean, default False. Check whether the new concatenated axis contains duplicates. This can be very expensive relative to the actual data concatenation. -- ``ignore_index`` : boolean, default False. If True, do not use the index - values on the concatenation axis. The resulting axis will be labeled 0, ..., - n - 1. This is useful if you are concatenating objects where the - concatenation axis does not have meaningful indexing information. Note - the index values on the other axes are still respected in the join. - ``copy`` : boolean, default True. If False, do not copy data unnecessarily. 
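For instance, assuming two small DataFrames ``df1`` and ``df2`` that share the same columns (hypothetical frames used only for illustration), a minimal sketch of the most commonly used keywords is:

.. code-block:: python

   import pandas as pd

   df1 = pd.DataFrame({'A': ['A0', 'A1'], 'B': ['B0', 'B1']})
   df2 = pd.DataFrame({'A': ['A2', 'A3'], 'B': ['B2', 'B3']})

   # stack the frames along axis=0; keys adds an outer index level
   # recording which frame each row came from
   pd.concat([df1, df2], axis=0, keys=['first', 'second'])

   # ignore the original row labels and relabel the result 0, ..., n - 1
   pd.concat([df1, df2], ignore_index=True)

Passing ``join='inner'`` instead restricts the result to the columns common to all of the passed objects.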
Without a little bit of context and example many of these arguments don't make @@ -510,48 +511,45 @@ standard database join operations between DataFrame objects: :: - merge(left, right, how='inner', on=None, left_on=None, right_on=None, - left_index=False, right_index=False, sort=True, - suffixes=('_x', '_y'), copy=True, indicator=False) - -Here's a description of what each argument is for: - - - ``left``: A DataFrame object - - ``right``: Another DataFrame object - - ``on``: Columns (names) to join on. Must be found in both the left and - right DataFrame objects. If not passed and ``left_index`` and - ``right_index`` are ``False``, the intersection of the columns in the - DataFrames will be inferred to be the join keys - - ``left_on``: Columns from the left DataFrame to use as keys. Can either be - column names or arrays with length equal to the length of the DataFrame - - ``right_on``: Columns from the right DataFrame to use as keys. Can either be - column names or arrays with length equal to the length of the DataFrame - - ``left_index``: If ``True``, use the index (row labels) from the left - DataFrame as its join key(s). In the case of a DataFrame with a MultiIndex - (hierarchical), the number of levels must match the number of join keys - from the right DataFrame - - ``right_index``: Same usage as ``left_index`` for the right DataFrame - - ``how``: One of ``'left'``, ``'right'``, ``'outer'``, ``'inner'``. Defaults - to ``inner``. See below for more detailed description of each method - - ``sort``: Sort the result DataFrame by the join keys in lexicographical - order. Defaults to ``True``, setting to ``False`` will improve performance - substantially in many cases - - ``suffixes``: A tuple of string suffixes to apply to overlapping - columns. Defaults to ``('_x', '_y')``. - - ``copy``: Always copy data (default ``True``) from the passed DataFrame - objects, even when reindexing is not necessary. Cannot be avoided in many - cases but may improve performance / memory usage. The cases where copying - can be avoided are somewhat pathological but this option is provided - nonetheless. - - ``indicator``: Add a column to the output DataFrame called ``_merge`` - with information on the source of each row. ``_merge`` is Categorical-type - and takes on a value of ``left_only`` for observations whose merge key - only appears in ``'left'`` DataFrame, ``right_only`` for observations whose - merge key only appears in ``'right'`` DataFrame, and ``both`` if the - observation's merge key is found in both. - - .. versionadded:: 0.17.0 - + pd.merge(left, right, how='inner', on=None, left_on=None, right_on=None, + left_index=False, right_index=False, sort=True, + suffixes=('_x', '_y'), copy=True, indicator=False) + +- ``left``: A DataFrame object +- ``right``: Another DataFrame object +- ``on``: Columns (names) to join on. Must be found in both the left and + right DataFrame objects. If not passed and ``left_index`` and + ``right_index`` are ``False``, the intersection of the columns in the + DataFrames will be inferred to be the join keys +- ``left_on``: Columns from the left DataFrame to use as keys. Can either be + column names or arrays with length equal to the length of the DataFrame +- ``right_on``: Columns from the right DataFrame to use as keys. Can either be + column names or arrays with length equal to the length of the DataFrame +- ``left_index``: If ``True``, use the index (row labels) from the left + DataFrame as its join key(s). 
In the case of a DataFrame with a MultiIndex + (hierarchical), the number of levels must match the number of join keys + from the right DataFrame +- ``right_index``: Same usage as ``left_index`` for the right DataFrame +- ``how``: One of ``'left'``, ``'right'``, ``'outer'``, ``'inner'``. Defaults + to ``inner``. See below for more detailed description of each method +- ``sort``: Sort the result DataFrame by the join keys in lexicographical + order. Defaults to ``True``, setting to ``False`` will improve performance + substantially in many cases +- ``suffixes``: A tuple of string suffixes to apply to overlapping + columns. Defaults to ``('_x', '_y')``. +- ``copy``: Always copy data (default ``True``) from the passed DataFrame + objects, even when reindexing is not necessary. Cannot be avoided in many + cases but may improve performance / memory usage. The cases where copying + can be avoided are somewhat pathological but this option is provided + nonetheless. +- ``indicator``: Add a column to the output DataFrame called ``_merge`` + with information on the source of each row. ``_merge`` is Categorical-type + and takes on a value of ``left_only`` for observations whose merge key + only appears in ``'left'`` DataFrame, ``right_only`` for observations whose + merge key only appears in ``'right'`` DataFrame, and ``both`` if the + observation's merge key is found in both. + + .. versionadded:: 0.17.0 The return type will be the same as ``left``. If ``left`` is a ``DataFrame`` and ``right`` is a subclass of DataFrame, the return type will still be @@ -573,11 +571,11 @@ terminology used to describe join operations between two SQL-table like structures (DataFrame objects). There are several cases to consider which are very important to understand: - - **one-to-one** joins: for example when joining two DataFrame objects on - their indexes (which must contain unique values) - - **many-to-one** joins: for example when joining an index (unique) to one or - more columns in a DataFrame - - **many-to-many** joins: joining columns on columns. +- **one-to-one** joins: for example when joining two DataFrame objects on + their indexes (which must contain unique values) +- **many-to-one** joins: for example when joining an index (unique) to one or + more columns in a DataFrame +- **many-to-many** joins: joining columns on columns. .. note:: @@ -714,15 +712,15 @@ The merge indicator .. ipython:: python - df1 = DataFrame({'col1':[0,1], 'col_left':['a','b']}) - df2 = DataFrame({'col1':[1,2,2],'col_right':[2,2,2]}) - merge(df1, df2, on='col1', how='outer', indicator=True) + df1 = pd.DataFrame({'col1': [0, 1], 'col_left':['a', 'b']}) + df2 = pd.DataFrame({'col1': [1, 2, 2],'col_right':[2, 2, 2]}) + pd.merge(df1, df2, on='col1', how='outer', indicator=True) -The ``indicator`` argument will also accept string arguments, in which case the indicator function will use the value of the passed string as the name for the indicator column. +The ``indicator`` argument will also accept string arguments, in which case the indicator function will use the value of the passed string as the name for the indicator column. .. ipython:: python - merge(df1, df2, on='col1', how='outer', indicator='indicator_column') + pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column') .. _merging.join.index: @@ -924,7 +922,7 @@ a level name of the multi-indexed frame. 
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2']}, - index=Index(['K0', 'K1', 'K2'], name='key')) + index=pd.Index(['K0', 'K1', 'K2'], name='key')) index = pd.MultiIndex.from_tuples([('K0', 'Y0'), ('K1', 'Y1'), ('K2', 'Y2'), ('K2', 'Y3')], @@ -1055,34 +1053,6 @@ them together on their indexes. The same is true for ``Panel.join``. labels=['left', 'right', 'right2'], vertical=False); plt.close('all'); -.. _merging.ordered_merge: - -Merging Ordered Data -~~~~~~~~~~~~~~~~~~~~ - -New in v0.8.0 is the ordered_merge function for combining time series and other -ordered data. In particular it has an optional ``fill_method`` keyword to -fill/interpolate missing data: - -.. ipython:: python - - left = DataFrame({'k': ['K0', 'K1', 'K1', 'K2'], - 'lv': [1, 2, 3, 4], - 's': ['a', 'b', 'c', 'd']}) - - right = DataFrame({'k': ['K1', 'K2', 'K4'], - 'rv': [1, 2, 3]}) - - result = ordered_merge(left, right, fill_method='ffill', left_by='s') - -.. ipython:: python - :suppress: - - @savefig merging_ordered_merge.png - p.plot([left, right], result, - labels=['left', 'right'], vertical=True); - plt.close('all'); - .. _merging.combine_first.update: Merging together values within Series or DataFrame columns @@ -1132,4 +1102,111 @@ values inplace: @savefig merging_update.png p.plot([df1_copy, df2], df1, labels=['df1', 'df2'], vertical=False); - plt.close('all'); \ No newline at end of file + plt.close('all'); + +.. _merging.time_series: + +Timeseries friendly merging +--------------------------- + +.. _merging.merge_ordered: + +Merging Ordered Data +~~~~~~~~~~~~~~~~~~~~ + +A :func:`merge_ordered` function allows combining time series and other +ordered data. In particular it has an optional ``fill_method`` keyword to +fill/interpolate missing data: + +.. ipython:: python + + left = pd.DataFrame({'k': ['K0', 'K1', 'K1', 'K2'], + 'lv': [1, 2, 3, 4], + 's': ['a', 'b', 'c', 'd']}) + + right = pd.DataFrame({'k': ['K1', 'K2', 'K4'], + 'rv': [1, 2, 3]}) + + pd.merge_ordered(left, right, fill_method='ffill', left_by='s') + +.. _merging.merge_asof: + +Merging AsOf +~~~~~~~~~~~~ + +.. versionadded:: 0.19.0 + +A :func:`merge_asof` is similar to an ordered left-join except that we match on nearest key rather than equal keys. For each row in the ``left`` DataFrame, we select the last row in the ``right`` DataFrame whose ``on`` key is less than the left's key. Both DataFrames must be sorted by the key. + +Optionally an asof merge can perform a group-wise merge. This matches the ``by`` key equally, +in addition to the nearest match on the ``on`` key. + +For example; we might have ``trades`` and ``quotes`` and we want to ``asof`` merge them. + +.. 
ipython:: python + + trades = pd.DataFrame({ + 'time': pd.to_datetime(['20160525 13:30:00.023', + '20160525 13:30:00.038', + '20160525 13:30:00.048', + '20160525 13:30:00.048', + '20160525 13:30:00.048']), + 'ticker': ['MSFT', 'MSFT', + 'GOOG', 'GOOG', 'AAPL'], + 'price': [51.95, 51.95, + 720.77, 720.92, 98.00], + 'quantity': [75, 155, + 100, 100, 100]}, + columns=['time', 'ticker', 'price', 'quantity']) + + quotes = pd.DataFrame({ + 'time': pd.to_datetime(['20160525 13:30:00.023', + '20160525 13:30:00.023', + '20160525 13:30:00.030', + '20160525 13:30:00.041', + '20160525 13:30:00.048', + '20160525 13:30:00.049', + '20160525 13:30:00.072', + '20160525 13:30:00.075']), + 'ticker': ['GOOG', 'MSFT', 'MSFT', + 'MSFT', 'GOOG', 'AAPL', 'GOOG', + 'MSFT'], + 'bid': [720.50, 51.95, 51.97, 51.99, + 720.50, 97.99, 720.50, 52.01], + 'ask': [720.93, 51.96, 51.98, 52.00, + 720.93, 98.01, 720.88, 52.03]}, + columns=['time', 'ticker', 'bid', 'ask']) + +.. ipython:: python + + trades + quotes + +By default we are taking the asof of the quotes. + +.. ipython:: python + + pd.merge_asof(trades, quotes, + on='time', + by='ticker') + +We only asof within ``2ms`` between the quote time and the trade time. + +.. ipython:: python + + pd.merge_asof(trades, quotes, + on='time', + by='ticker', + tolerance=pd.Timedelta('2ms')) + +We only asof within ``10ms`` between the quote time and the trade time and we exclude exact matches on time. +Note that though we exclude the exact matches (of the quotes), prior quotes DO propagate to that point +in time. + +.. ipython:: python + + pd.merge_asof(trades, quotes, + on='time', + by='ticker', + tolerance=pd.Timedelta('10ms'), + allow_exact_matches=False) diff --git a/doc/source/options.rst b/doc/source/options.rst index d761d827006be..77cac6d495d13 100644 --- a/doc/source/options.rst +++ b/doc/source/options.rst @@ -71,6 +71,7 @@ with no argument ``describe_option`` will print out the descriptions for all ava .. ipython:: python :suppress: + :okwarning: pd.reset_option("all") @@ -391,6 +392,9 @@ display.width 80 Width of the display in characters. IPython qtconsole, or IDLE do not run in a terminal and hence it is not possible to correctly detect the width. +html.border 1 A ``border=value`` attribute is + inserted in the ```` tag + for the DataFrame HTML repr. io.excel.xls.writer xlwt The default Excel writer engine for 'xls' files. io.excel.xlsm.writer openpyxl The default Excel writer engine for diff --git a/doc/source/r_interface.rst b/doc/source/r_interface.rst index efe403c85f330..f3df1ebdf25cb 100644 --- a/doc/source/r_interface.rst +++ b/doc/source/r_interface.rst @@ -17,7 +17,7 @@ rpy2 / R interface In v0.16.0, the ``pandas.rpy`` interface has been **deprecated and will be removed in a future version**. Similar functionality can be accessed - through the `rpy2 `_ project. + through the `rpy2 `__ project. See the :ref:`updating ` section for a guide to port your code from the ``pandas.rpy`` to ``rpy2`` functions. @@ -73,7 +73,7 @@ The ``convert_to_r_matrix`` function can be replaced by the normal comparison to the ones in pandas, please report this at the `issue tracker `_. -See also the documentation of the `rpy2 `_ project. +See also the documentation of the `rpy2 `__ project. R interface with rpy2 diff --git a/doc/source/release.rst b/doc/source/release.rst index 37778c46a8ec0..df76c90d0f5e6 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -630,7 +630,7 @@ Highlights include: modules are deprecated.
We refer users to external packages like `seaborn `_, `pandas-qt `_ and - `rpy2 `_ for similar or equivalent + `rpy2 `_ for similar or equivalent functionality, see :ref:`here ` See the :ref:`v0.16.0 Whatsnew ` overview or the issue tracker on GitHub for an extensive list diff --git a/doc/source/remote_data.rst b/doc/source/remote_data.rst index 842fcb6896680..019aa82fed1aa 100644 --- a/doc/source/remote_data.rst +++ b/doc/source/remote_data.rst @@ -2,34 +2,21 @@ .. currentmodule:: pandas -.. ipython:: python - :suppress: - - import os - import csv - import pandas as pd - - import numpy as np - np.random.seed(123456) - randn = np.random.randn - np.set_printoptions(precision=4, suppress=True) - - import matplotlib.pyplot as plt - plt.close('all') - - from pandas import * - options.display.max_rows=15 - import pandas.util.testing as tm - ****************** Remote Data Access ****************** .. _remote_data.pandas_datareader: -.. warning:: +DataReader +---------- - In pandas 0.17.0, the sub-package ``pandas.io.data`` will be removed in favor of a separately installable `pandas-datareader package `_. This will allow the data modules to be independently updated to your pandas installation. The API for ``pandas-datareader v0.1.1`` is the same as in ``pandas v0.16.1``. (:issue:`8961`) +The sub-package ``pandas.io.data`` is removed in favor of a separately +installable `pandas-datareader package +`_. This will allow the data +modules to be independently updated to your pandas installation. The API for +``pandas-datareader v0.1.1`` is the same as in ``pandas v0.16.1``. +(:issue:`8961`) You should replace the imports of the following: @@ -43,310 +30,6 @@ Remote Data Access from pandas_datareader import data, wb -.. _remote_data.data_reader: - -Functions from :mod:`pandas.io.data` and :mod:`pandas.io.ga` extract data from various Internet sources into a DataFrame. Currently the following sources are supported: - - - :ref:`Yahoo! Finance` - - :ref:`Google Finance` - - :ref:`St.Louis FED (FRED)` - - :ref:`Kenneth French's data library` - - :ref:`World Bank` - - :ref:`Google Analytics` - -It should be noted, that various sources support different kinds of data, so not all sources implement the same methods and the data elements returned might also differ. - -.. _remote_data.yahoo: - -Yahoo! Finance --------------- - -.. ipython:: python - :okwarning: - - import pandas.io.data as web - import datetime - start = datetime.datetime(2010, 1, 1) - end = datetime.datetime(2013, 1, 27) - f = web.DataReader("F", 'yahoo', start, end) - f.ix['2010-01-04'] - -.. _remote_data.yahoo_options: - -Yahoo! Finance Options ----------------------- -***Experimental*** - -The ``Options`` class allows the download of options data from Yahoo! Finance. - -The ``get_all_data`` method downloads and caches option data for all expiry months -and provides a formatted ``DataFrame`` with a hierarchical index, so it is easy to get -to the specific option you want. - -.. ipython:: python - - from pandas.io.data import Options - aapl = Options('aapl', 'yahoo') - data = aapl.get_all_data() - data.iloc[0:5, 0:5] - - # Show the $100 strike puts at all expiry dates: - data.loc[(100, slice(None), 'put'),:].iloc[0:5, 0:5] - - # Show the volume traded of $100 strike puts at all expiry dates: - data.loc[(100, slice(None), 'put'),'Vol'].head() - -If you don't want to download all the data, more specific requests can be made. - -.. 
ipython:: python - - import datetime - expiry = datetime.date(2016, 1, 1) - data = aapl.get_call_data(expiry=expiry) - data.iloc[0:5:, 0:5] - -Note that if you call ``get_all_data`` first, this second call will happen much faster, -as the data is cached. - -If a given expiry date is not available, data for the next available expiry will be -returned (January 15, 2015 in the above example). - -Available expiry dates can be accessed from the ``expiry_dates`` property. - -.. ipython:: python - - aapl.expiry_dates - data = aapl.get_call_data(expiry=aapl.expiry_dates[0]) - data.iloc[0:5:, 0:5] - -A list-like object containing dates can also be passed to the expiry parameter, -returning options data for all expiry dates in the list. - -.. ipython:: python - - data = aapl.get_near_stock_price(expiry=aapl.expiry_dates[0:3]) - data.iloc[0:5:, 0:5] - -The ``month`` and ``year`` parameters can be used to get all options data for a given month. - -.. _remote_data.google: - -Google Finance --------------- - -.. ipython:: python - - import pandas.io.data as web - import datetime - start = datetime.datetime(2010, 1, 1) - end = datetime.datetime(2013, 1, 27) - f = web.DataReader("F", 'google', start, end) - f.ix['2010-01-04'] - -.. _remote_data.fred: - -FRED ----- - -.. ipython:: python - - import pandas.io.data as web - import datetime - start = datetime.datetime(2010, 1, 1) - end = datetime.datetime(2013, 1, 27) - gdp=web.DataReader("GDP", "fred", start, end) - gdp.ix['2013-01-01'] - - # Multiple series: - inflation = web.DataReader(["CPIAUCSL", "CPILFESL"], "fred", start, end) - inflation.head() -.. _remote_data.ff: - -Fama/French ------------ - -Dataset names are listed at `Fama/French Data Library -`__. - -.. ipython:: python - - import pandas.io.data as web - ip = web.DataReader("5_Industry_Portfolios", "famafrench") - ip[4].ix[192607] - -.. _remote_data.wb: - -World Bank ----------- - -``pandas`` users can easily access thousands of panel data series from the -`World Bank's World Development Indicators `__ -by using the ``wb`` I/O functions. - -Indicators -~~~~~~~~~~ - -Either from exploring the World Bank site, or using the search function included, -every world bank indicator is accessible. - -For example, if you wanted to compare the Gross Domestic Products per capita in -constant dollars in North America, you would use the ``search`` function: - -.. code-block:: ipython - - In [1]: from pandas.io import wb - - In [2]: wb.search('gdp.*capita.*const').iloc[:,:2] - Out[2]: - id name - 3242 GDPPCKD GDP per Capita, constant US$, millions - 5143 NY.GDP.PCAP.KD GDP per capita (constant 2005 US$) - 5145 NY.GDP.PCAP.KN GDP per capita (constant LCU) - 5147 NY.GDP.PCAP.PP.KD GDP per capita, PPP (constant 2005 internation... - -Then you would use the ``download`` function to acquire the data from the World -Bank's servers: - -.. code-block:: ipython - - In [3]: dat = wb.download(indicator='NY.GDP.PCAP.KD', country=['US', 'CA', 'MX'], start=2005, end=2008) - - In [4]: print(dat) - NY.GDP.PCAP.KD - country year - Canada 2008 36005.5004978584 - 2007 36182.9138439757 - 2006 35785.9698172849 - 2005 35087.8925933298 - Mexico 2008 8113.10219480083 - 2007 8119.21298908649 - 2006 7961.96818458178 - 2005 7666.69796097264 - United States 2008 43069.5819857208 - 2007 43635.5852068142 - 2006 43228.111147107 - 2005 42516.3934699993 - -The resulting dataset is a properly formatted ``DataFrame`` with a hierarchical -index, so it is easy to apply ``.groupby`` transformations to it: - -.. 
code-block:: ipython - - In [6]: dat['NY.GDP.PCAP.KD'].groupby(level=0).mean() - Out[6]: - country - Canada 35765.569188 - Mexico 7965.245332 - United States 43112.417952 - dtype: float64 - -Now imagine you want to compare GDP to the share of people with cellphone -contracts around the world. - -.. code-block:: ipython - - In [7]: wb.search('cell.*%').iloc[:,:2] - Out[7]: - id name - 3990 IT.CEL.SETS.FE.ZS Mobile cellular telephone users, female (% of ... - 3991 IT.CEL.SETS.MA.ZS Mobile cellular telephone users, male (% of po... - 4027 IT.MOB.COV.ZS Population coverage of mobile cellular telepho... - -Notice that this second search was much faster than the first one because -``pandas`` now has a cached list of available data series. - -.. code-block:: ipython - - In [13]: ind = ['NY.GDP.PCAP.KD', 'IT.MOB.COV.ZS'] - In [14]: dat = wb.download(indicator=ind, country='all', start=2011, end=2011).dropna() - In [15]: dat.columns = ['gdp', 'cellphone'] - In [16]: print(dat.tail()) - gdp cellphone - country year - Swaziland 2011 2413.952853 94.9 - Tunisia 2011 3687.340170 100.0 - Uganda 2011 405.332501 100.0 - Zambia 2011 767.911290 62.0 - Zimbabwe 2011 419.236086 72.4 - -Finally, we use the ``statsmodels`` package to assess the relationship between -our two variables using ordinary least squares regression. Unsurprisingly, -populations in rich countries tend to use cellphones at a higher rate: - -.. code-block:: ipython - - In [17]: import numpy as np - In [18]: import statsmodels.formula.api as smf - In [19]: mod = smf.ols("cellphone ~ np.log(gdp)", dat).fit() - In [20]: print(mod.summary()) - OLS Regression Results - ============================================================================== - Dep. Variable: cellphone R-squared: 0.297 - Model: OLS Adj. R-squared: 0.274 - Method: Least Squares F-statistic: 13.08 - Date: Thu, 25 Jul 2013 Prob (F-statistic): 0.00105 - Time: 15:24:42 Log-Likelihood: -139.16 - No. Observations: 33 AIC: 282.3 - Df Residuals: 31 BIC: 285.3 - Df Model: 1 - =============================================================================== - coef std err t P>|t| [95.0% Conf. Int.] - ------------------------------------------------------------------------------- - Intercept 16.5110 19.071 0.866 0.393 -22.384 55.406 - np.log(gdp) 9.9333 2.747 3.616 0.001 4.331 15.535 - ============================================================================== - Omnibus: 36.054 Durbin-Watson: 2.071 - Prob(Omnibus): 0.000 Jarque-Bera (JB): 119.133 - Skew: -2.314 Prob(JB): 1.35e-26 - Kurtosis: 11.077 Cond. No. 45.8 - ============================================================================== - -Country Codes -~~~~~~~~~~~~~ - -.. versionadded:: 0.15.1 - -The ``country`` argument accepts a string or list of mixed -`two `__ or `three `__ character -ISO country codes, as well as dynamic `World Bank exceptions `__ to the ISO standards. - -For a list of the the hard-coded country codes (used solely for error handling logic) see ``pandas.io.wb.country_codes``. - -Problematic Country Codes & Indicators -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. note:: - - The World Bank's country list and indicators are dynamic. As of 0.15.1, - :func:`wb.download()` is more flexible. To achieve this, the warning - and exception logic changed. - -The world bank converts some country codes in their response, which makes error -checking by pandas difficult. Retired indicators still persist in the search. 
- -Given the new flexibility of 0.15.1, improved error handling by the user -may be necessary for fringe cases. - -To help identify issues: - -There are at least 4 kinds of country codes: - -1. Standard (2/3 digit ISO) - returns data, will warn and error properly. -2. Non-standard (WB Exceptions) - returns data, but will falsely warn. -3. Blank - silently missing from the response. -4. Bad - causes the entire response from WB to fail, always exception inducing. - -There are at least 3 kinds of indicators: - -1. Current - Returns data. -2. Retired - Appears in search results, yet won't return data. -3. Bad - Will not return data. - -Use the ``errors`` argument to control warnings and exceptions. Setting -errors to ignore or warn, won't stop failed responses. (ie, 100% bad -indicators, or a single "bad" (#4 above) country code). - -See docstrings for more info. .. _remote_data.ga: diff --git a/doc/source/sparse.rst b/doc/source/sparse.rst index 257fb2909d42c..d3f921f8762cc 100644 --- a/doc/source/sparse.rst +++ b/doc/source/sparse.rst @@ -9,19 +9,20 @@ import pandas as pd import pandas.util.testing as tm np.set_printoptions(precision=4, suppress=True) - options.display.max_rows = 15 + pd.options.display.max_rows = 15 ********************** Sparse data structures ********************** -We have implemented "sparse" versions of Series, DataFrame, and Panel. These -are not sparse in the typical "mostly 0". You can view these objects as being -"compressed" where any data matching a specific value (NaN/missing by default, -though any value can be chosen) is omitted. A special ``SparseIndex`` object -tracks where data has been "sparsified". This will make much more sense in an -example. All of the standard pandas data structures have a ``to_sparse`` -method: +.. note:: The ``SparsePanel`` class has been removed in 0.19.0 + +We have implemented "sparse" versions of Series and DataFrame. These are not sparse +in the typical "mostly 0". Rather, you can view these objects as being "compressed" +where any data matching a specific value (``NaN`` / missing value, though any value +can be chosen) is omitted. A special ``SparseIndex`` object tracks where data has been +"sparsified". This will make much more sense in an example. All of the standard pandas +data structures have a ``to_sparse`` method: .. ipython:: python @@ -77,9 +78,8 @@ distinct from the ``fill_value``: sparr = pd.SparseArray(arr) sparr -Like the indexed objects (SparseSeries, SparseDataFrame, SparsePanel), a -``SparseArray`` can be converted back to a regular ndarray by calling -``to_dense``: +Like the indexed objects (SparseSeries, SparseDataFrame), a ``SparseArray`` +can be converted back to a regular ndarray by calling ``to_dense``: .. ipython:: python @@ -90,45 +90,96 @@ Like the indexed objects (SparseSeries, SparseDataFrame, SparsePanel), a SparseList ---------- -``SparseList`` is a list-like data structure for managing a dynamic collection -of SparseArrays. To create one, simply call the ``SparseList`` constructor with -a ``fill_value`` (defaulting to ``NaN``): +The ``SparseList`` class has been deprecated and will be removed in a future version. +See the `docs of a previous version `__ +for documentation on ``SparseList``. -.. ipython:: python - spl = pd.SparseList() - spl +SparseIndex objects +------------------- + +Two kinds of ``SparseIndex`` are implemented, ``block`` and ``integer``. We +recommend using ``block`` as it's more memory efficient. 
The ``integer`` format +keeps an array of all of the locations where the data are not equal to the +fill value. The ``block`` format tracks only the locations and sizes of blocks +of data. -The two important methods are ``append`` and ``to_array``. ``append`` can -accept scalar values or any 1-dimensional sequence: +.. _sparse.dtype: + +Sparse Dtypes +------------- + +Sparse data should have the same dtype as its dense representation. Currently, +``float64``, ``int64`` and ``bool`` dtypes are supported. Depending on the original +dtype, ``fill_value`` default changes: + +- ``float64``: ``np.nan`` +- ``int64``: ``0`` +- ``bool``: ``False`` .. ipython:: python - :suppress: + + s = pd.Series([1, np.nan, np.nan]) + s + s.to_sparse() + + s = pd.Series([1, 0, 0]) + s + s.to_sparse() + + s = pd.Series([True, False, True]) + s + s.to_sparse() + +You can change the dtype using ``.astype()``; the result is also sparse. Note that +``.astype()`` also affects the ``fill_value`` to keep its dense representation. + .. ipython:: python - spl.append(np.array([1., np.nan, np.nan, 2., 3.])) - spl.append(5) - spl.append(sparr) - spl + s = pd.Series([1, 0, 0, 0, 0]) + s + ss = s.to_sparse() + ss + ss.astype(np.float64) + +It raises if any value cannot be coerced to the specified dtype. + +.. code-block:: ipython + + In [1]: ss = pd.Series([1, np.nan, np.nan]).to_sparse() + 0 1.0 + 1 NaN + 2 NaN + dtype: float64 + BlockIndex + Block locations: array([0], dtype=int32) + Block lengths: array([1], dtype=int32) + + In [2]: ss.astype(np.int64) + ValueError: unable to coerce current fill_value nan to int64 dtype + +.. _sparse.calculation: + +Sparse Calculation +------------------ -As you can see, all of the contents are stored internally as a list of -memory-efficient ``SparseArray`` objects. Once you've accumulated all of the -data, you can call ``to_array`` to get a single ``SparseArray`` with all the -data: +You can apply NumPy *ufuncs* to ``SparseArray`` and get a ``SparseArray`` as a result. .. ipython:: python - spl.to_array() + arr = pd.SparseArray([1., np.nan, np.nan, -2., np.nan]) + np.abs(arr) -SparseIndex objects ------------------- -Two kinds of ``SparseIndex`` are implemented, ``block`` and ``integer``. We -recommend using ``block`` as it's more memory efficient. The ``integer`` format -keeps an arrays of all of the locations where the data are not equal to the -fill value. The ``block`` format tracks only the locations and sizes of blocks -of data. +The *ufunc* is also applied to ``fill_value``. This is needed to get +the correct dense result. + +.. ipython:: python + + arr = pd.SparseArray([1., -1, -1, -2., -1], fill_value=-1) + np.abs(arr) + np.abs(arr).to_dense() .. _sparse.scipysparse: diff --git a/doc/source/text.rst b/doc/source/text.rst index 3822c713d7f85..3a4a57ff4da95 100644 --- a/doc/source/text.rst +++ b/doc/source/text.rst @@ -316,7 +316,7 @@ then ``extractall(pat).xs(0, level='match')`` gives the same result as ``Index`` also supports ``.str.extractall``. It returns a ``DataFrame`` which has the same result as a ``Series.str.extractall`` with a default index (starts from 0). -.. versionadded:: 0.18.2 +.. versionadded:: 0.19.0 .. ipython:: python diff --git a/doc/source/themes/nature_with_gtoc/layout.html b/doc/source/themes/nature_with_gtoc/layout.html index fd0755e096023..ddf1e861f5f81 100644 --- a/doc/source/themes/nature_with_gtoc/layout.html +++ b/doc/source/themes/nature_with_gtoc/layout.html @@ -61,3 +61,37 @@

{{ _('Search') }}

{%- endblock %} + +{%- block footer %} + +Scroll To Top + +{% endblock %} \ No newline at end of file diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index 62601821488d3..4132d25e9be48 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -7,7 +7,7 @@ from datetime import datetime, timedelta, time import numpy as np import pandas as pd - from pandas import datetools + from pandas import offsets np.random.seed(123456) randn = np.random.randn randint = np.random.randint @@ -544,8 +544,8 @@ There are several time/date properties that one can access from ``Timestamp`` or second,"The seconds of the datetime" microsecond,"The microseconds of the datetime" nanosecond,"The nanoseconds of the datetime" - date,"Returns datetime.date" - time,"Returns datetime.time" + date,"Returns datetime.date (does not contain timezone information)" + time,"Returns datetime.time (does not contain timezone information)" dayofyear,"The ordinal day of year" weekofyear,"The week ordinal of the year" week,"The week ordinal of the year" @@ -560,6 +560,7 @@ There are several time/date properties that one can access from ``Timestamp`` or is_quarter_end,"Logical indicating if last day of quarter (defined by frequency)" is_year_start,"Logical indicating if first day of year (defined by frequency)" is_year_end,"Logical indicating if last day of year (defined by frequency)" + is_leap_year,"Logical indicating if the date belongs to a leap year" Furthermore, if you have a ``Series`` with datetimelike values, then you can access these properties via the ``.dt`` accessor, see the :ref:`docs ` @@ -567,11 +568,11 @@ DateOffset objects ------------------ In the preceding examples, we created DatetimeIndex objects at various -frequencies by passing in frequency strings like 'M', 'W', and 'BM to the -``freq`` keyword. Under the hood, these frequency strings are being translated -into an instance of pandas ``DateOffset``, which represents a regular -frequency increment. Specific offset logic like "month", "business day", or -"one hour" is represented in its various subclasses. +frequencies by passing in :ref:`frequency strings ` +like 'M', 'W', and 'BM to the ``freq`` keyword. Under the hood, these frequency +strings are being translated into an instance of pandas ``DateOffset``, +which represents a regular frequency increment. Specific offset logic like +"month", "business day", or "one hour" is represented in its various subclasses. .. csv-table:: :header: "Class name", "Description" @@ -589,6 +590,8 @@ frequency increment. Specific offset logic like "month", "business day", or BMonthBegin, "business month begin" CBMonthEnd, "custom business month end" CBMonthBegin, "custom business month begin" + SemiMonthEnd, "15th (or other day_of_month) and calendar month end" + SemiMonthBegin, "15th (or other day_of_month) and calendar month begin" QuarterEnd, "calendar quarter end" QuarterBegin, "calendar quarter begin" BQuarterEnd, "business quarter end" @@ -750,7 +753,7 @@ calculate significantly slower and will raise a ``PerformanceWarning`` rng + BQuarterEnd() -.. _timeseries.alias: +.. _timeseries.custombusinessdays: Custom Business Days (Experimental) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -951,6 +954,8 @@ You can use keyword arguments suported by either ``BusinessHour`` and ``CustomBu # Monday is skipped because it's a holiday, business hour starts from 10:00 dt + bhour_mon * 2 +.. _timeseries.offset_aliases: + Offset Aliases ~~~~~~~~~~~~~~ @@ -967,9 +972,11 @@ frequencies. 
We will refer to these aliases as *offset aliases* "D", "calendar day frequency" "W", "weekly frequency" "M", "month end frequency" + "SM", "semi-month end frequency (15th and end of month)" "BM", "business month end frequency" "CBM", "custom business month end frequency" "MS", "month start frequency" + "SMS", "semi-month start frequency (1st and 15th)" "BMS", "business month start frequency" "CBMS", "custom business month start frequency" "Q", "quarter end frequency" @@ -1099,48 +1106,6 @@ it is rolled forward to the next anchor point. pd.Timestamp('2014-01-01') + MonthBegin(n=0) pd.Timestamp('2014-01-31') + MonthEnd(n=0) -.. _timeseries.legacyaliases: - -Legacy Aliases -~~~~~~~~~~~~~~ -Note that prior to v0.8.0, time rules had a slightly different look. These are -deprecated in v0.17.0, and removed in future version. - -.. csv-table:: - :header: "Legacy Time Rule", "Offset Alias" - :widths: 15, 65 - - "WEEKDAY", "B" - "EOM", "BM" - "W\@MON", "W\-MON" - "W\@TUE", "W\-TUE" - "W\@WED", "W\-WED" - "W\@THU", "W\-THU" - "W\@FRI", "W\-FRI" - "W\@SAT", "W\-SAT" - "W\@SUN", "W\-SUN" - "Q\@JAN", "BQ\-JAN" - "Q\@FEB", "BQ\-FEB" - "Q\@MAR", "BQ\-MAR" - "A\@JAN", "BA\-JAN" - "A\@FEB", "BA\-FEB" - "A\@MAR", "BA\-MAR" - "A\@APR", "BA\-APR" - "A\@MAY", "BA\-MAY" - "A\@JUN", "BA\-JUN" - "A\@JUL", "BA\-JUL" - "A\@AUG", "BA\-AUG" - "A\@SEP", "BA\-SEP" - "A\@OCT", "BA\-OCT" - "A\@NOV", "BA\-NOV" - "A\@DEC", "BA\-DEC" - - -As you can see, legacy quarterly and annual frequencies are business quarters -and business year ends. Please also note the legacy time rule for milliseconds -``ms`` versus the new offset alias for month start ``MS``. This means that -offset alias parsing is case sensitive. - .. _timeseries.holiday: Holidays / Holiday Calendars @@ -1254,11 +1219,11 @@ objects. ts.shift(1) The shift method accepts an ``freq`` argument which can accept a -``DateOffset`` class or other ``timedelta``-like object or also a :ref:`offset alias `: +``DateOffset`` class or other ``timedelta``-like object or also a :ref:`offset alias `: .. ipython:: python - ts.shift(5, freq=datetools.bday) + ts.shift(5, freq=offsets.BDay()) ts.shift(5, freq='BM') Rather than changing the alignment of the data and the index, ``DataFrame`` and @@ -1281,7 +1246,7 @@ around ``reindex`` which generates a ``date_range`` and calls ``reindex``. .. ipython:: python - dr = pd.date_range('1/1/2010', periods=3, freq=3 * datetools.bday) + dr = pd.date_range('1/1/2010', periods=3, freq=3 * offsets.BDay()) ts = pd.Series(randn(3), index=dr) ts ts.asfreq(BDay()) @@ -1320,7 +1285,11 @@ performing resampling operations during frequency conversion (e.g., converting secondly data into 5-minutely data). This is extremely common in, but not limited to, financial applications. -``resample`` is a time-based groupby, followed by a reduction method on each of its groups. +``.resample()`` is a time-based groupby, followed by a reduction method on each of its groups. + +.. note:: + + ``.resample()`` is similar to using a ``.rolling()`` operation with a time-based offset, see a discussion `here ` See some :ref:`cookbook examples ` for some advanced strategies @@ -1504,6 +1473,30 @@ Furthermore, you can also specify multiple aggregation functions for each column r.agg({'A' : ['sum','std'], 'B' : ['mean','std'] }) +If a ``DataFrame`` does not have a datetimelike index, but instead you want +to resample based on datetimelike column in the frame, it can passed to the +``on`` keyword. + +.. 
ipython:: python + + df = pd.DataFrame({'date': pd.date_range('2015-01-01', freq='W', periods=5), + 'a': np.arange(5)}, + index=pd.MultiIndex.from_arrays([ + [1,2,3,4,5], + pd.date_range('2015-01-01', freq='W', periods=5)], + names=['v','d'])) + df + df.resample('M', on='date').sum() + +Similarly, if you instead want to resample by a datetimelike +level of ``MultiIndex``, its name or location can be passed to the +``level`` keyword. + +.. ipython:: python + + df.resample('M', level='d').sum() + + .. _timeseries.periods: Time Span Representation @@ -1625,6 +1618,45 @@ objects: idx idx + MonthEnd(3) +``PeriodIndex`` has its own dtype named ``period``, refer to :ref:`Period Dtypes `. + +.. _timeseries.period_dtype: + +Period Dtypes +~~~~~~~~~~~~~ + +.. versionadded:: 0.19.0 + +``PeriodIndex`` has a custom ``period`` dtype. This is a pandas extension +dtype similar to the :ref:`timezone aware dtype ` (``datetime64[ns, tz]``). + +The ``period`` dtype holds the ``freq`` attribute and is represented with +``period[freq]`` like ``period[D]`` or ``period[M]``, using :ref:`frequency strings `. + +.. ipython:: python + + pi = pd.period_range('2016-01-01', periods=3, freq='M') + pi + pi.dtype + +The ``period`` dtype can be used in ``.astype(...)``. It allows one to change the +``freq`` of a ``PeriodIndex`` like ``.asfreq()`` and convert a +``DatetimeIndex`` to ``PeriodIndex`` like ``to_period()``: + +.. ipython:: python + + # change monthly freq to daily freq + pi.astype('period[D]') + + # convert to DatetimeIndex + pi.astype('datetime64[ns]') + + # convert to PeriodIndex + dti = pd.date_range('2011-01-01', freq='M', periods=3) + dti + dti.astype('period[M]') + + PeriodIndex Partial String Indexing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/visualization.rst b/doc/source/visualization.rst index 7840ae29298b0..6e05c3ff0457a 100644 --- a/doc/source/visualization.rst +++ b/doc/source/visualization.rst @@ -456,28 +456,29 @@ columns: .. _visualization.box.return: -Basically, plot functions return :class:`matplotlib Axes ` as a return value. -In ``boxplot``, the return type can be changed by argument ``return_type``, and whether the subplots is enabled (``subplots=True`` in ``plot`` or ``by`` is specified in ``boxplot``). +.. warning:: -When ``subplots=False`` / ``by`` is ``None``: + The default changed from ``'dict'`` to ``'axes'`` in version 0.19.0. -* if ``return_type`` is ``'dict'``, a dictionary containing the :class:`matplotlib Lines ` is returned. The keys are "boxes", "caps", "fliers", "medians", and "whiskers". - This is the default of ``boxplot`` in historical reason. - Note that ``plot.box()`` returns ``Axes`` by default same as other plots. -* if ``return_type`` is ``'axes'``, a :class:`matplotlib Axes ` containing the boxplot is returned. -* if ``return_type`` is ``'both'`` a namedtuple containing the :class:`matplotlib Axes ` - and :class:`matplotlib Lines ` is returned +In ``boxplot``, the return type can be controlled by the ``return_type``, keyword. The valid choices are ``{"axes", "dict", "both", None}``. +Faceting, created by ``DataFrame.boxplot`` with the ``by`` +keyword, will affect the output type as well: -When ``subplots=True`` / ``by`` is some column of the DataFrame: +================ ======= ========================== +``return_type=`` Faceted Output type +---------------- ------- -------------------------- -* A dict of ``return_type`` is returned, where the keys are the columns - of the DataFrame. 
The plot has a facet for each column of - the DataFrame, with a separate box for each value of ``by``. +``None`` No axes +``None`` Yes 2-D ndarray of axes +``'axes'`` No axes +``'axes'`` Yes Series of axes +``'dict'`` No dict of artists +``'dict'`` Yes Series of dicts of artists +``'both'`` No namedtuple +``'both'`` Yes Series of namedtuples +================ ======= ========================== -Finally, when calling boxplot on a :class:`Groupby` object, a dict of ``return_type`` -is returned, where the keys are the same as the Groupby object. The plot has a -facet for each key, with each facet containing a box for each column of the -DataFrame. +``Groupby.boxplot`` always returns a Series of ``return_type``. .. ipython:: python :okwarning: @@ -1615,246 +1616,8 @@ Trellis plotting interface .. warning:: - The ``rplot`` trellis plotting interface is **deprecated and will be removed - in a future version**. We refer to external packages like - `seaborn `_ for similar but more - refined functionality. - - The docs below include some example on how to convert your existing code to - ``seaborn``. - -.. ipython:: python - :suppress: - - tips_data = pd.read_csv('data/tips.csv') - iris_data = pd.read_csv('data/iris.data') - plt.close('all') - - -.. note:: - - The tips data set can be downloaded `here - `__. Once you download it execute - - .. code-block:: python - - tips_data = pd.read_csv('tips.csv') - - from the directory where you downloaded the file. - -We import the rplot API: - -.. ipython:: python - :okwarning: - - import pandas.tools.rplot as rplot - -Examples -~~~~~~~~ - -RPlot was an API for producing Trellis plots. These plots allow you to -arrange data in a rectangular grid by values of certain attributes. -In the example below, data from the tips data set is arranged by the attributes -'sex' and 'smoker'. Since both of those attributes can take on one of two -values, the resulting grid has two columns and two rows. A histogram is -displayed for each cell of the grid. - -.. ipython:: python - :okwarning: - - plt.figure() - - plot = rplot.RPlot(tips_data, x='total_bill', y='tip') - plot.add(rplot.TrellisGrid(['sex', 'smoker'])) - plot.add(rplot.GeomHistogram()) - - @savefig rplot1_tips.png - plot.render(plt.gcf()) - -.. ipython:: python - :suppress: - - plt.close('all') - -A similar plot can be made with ``seaborn`` using the ``FacetGrid`` object, -resulting in the following image: - -.. code-block:: python - - import seaborn as sns - g = sns.FacetGrid(tips_data, row="sex", col="smoker") - g.map(plt.hist, "total_bill") - -.. image:: _static/rplot-seaborn-example1.png - - -Example below is the same as previous except the plot is set to kernel density -estimation. A ``seaborn`` example is included beneath. - -.. ipython:: python - :okwarning: - - plt.figure() - - plot = rplot.RPlot(tips_data, x='total_bill', y='tip') - plot.add(rplot.TrellisGrid(['sex', 'smoker'])) - plot.add(rplot.GeomDensity()) - - @savefig rplot2_tips.png - plot.render(plt.gcf()) - -.. ipython:: python - :suppress: - - plt.close('all') - -.. code-block:: python - - g = sns.FacetGrid(tips_data, row="sex", col="smoker") - g.map(sns.kdeplot, "total_bill") - -.. image:: _static/rplot-seaborn-example2.png - -The plot below shows that it is possible to have two or more plots for the same -data displayed on the same Trellis grid cell. - -.. 
ipython:: python - :okwarning: - - plt.figure() - - plot = rplot.RPlot(tips_data, x='total_bill', y='tip') - plot.add(rplot.TrellisGrid(['sex', 'smoker'])) - plot.add(rplot.GeomScatter()) - plot.add(rplot.GeomPolyFit(degree=2)) - - @savefig rplot3_tips.png - plot.render(plt.gcf()) - -.. ipython:: python - :suppress: - - plt.close('all') - -A seaborn equivalent for a simple scatter plot: - -.. code-block:: python - - g = sns.FacetGrid(tips_data, row="sex", col="smoker") - g.map(plt.scatter, "total_bill", "tip") - -.. image:: _static/rplot-seaborn-example3.png - -and with a regression line, using the dedicated ``seaborn`` ``regplot`` function: - -.. code-block:: python - - g = sns.FacetGrid(tips_data, row="sex", col="smoker", margin_titles=True) - g.map(sns.regplot, "total_bill", "tip", order=2) - -.. image:: _static/rplot-seaborn-example3b.png - - -Below is a similar plot but with 2D kernel density estimation plot superimposed, -followed by a ``seaborn`` equivalent: - -.. ipython:: python - :okwarning: - - plt.figure() - - plot = rplot.RPlot(tips_data, x='total_bill', y='tip') - plot.add(rplot.TrellisGrid(['sex', 'smoker'])) - plot.add(rplot.GeomScatter()) - plot.add(rplot.GeomDensity2D()) - - @savefig rplot4_tips.png - plot.render(plt.gcf()) - -.. ipython:: python - :suppress: - - plt.close('all') - -.. code-block:: python - - g = sns.FacetGrid(tips_data, row="sex", col="smoker") - g.map(plt.scatter, "total_bill", "tip") - g.map(sns.kdeplot, "total_bill", "tip") - -.. image:: _static/rplot-seaborn-example4.png - -It is possible to only use one attribute for grouping data. The example above -only uses 'sex' attribute. If the second grouping attribute is not specified, -the plots will be arranged in a column. - -.. ipython:: python - :okwarning: - - plt.figure() - - plot = rplot.RPlot(tips_data, x='total_bill', y='tip') - plot.add(rplot.TrellisGrid(['sex', '.'])) - plot.add(rplot.GeomHistogram()) - - @savefig rplot5_tips.png - plot.render(plt.gcf()) - -.. ipython:: python - :suppress: - - plt.close('all') - -If the first grouping attribute is not specified the plots will be arranged in a row. - -.. ipython:: python - :okwarning: - - plt.figure() - - plot = rplot.RPlot(tips_data, x='total_bill', y='tip') - plot.add(rplot.TrellisGrid(['.', 'smoker'])) - plot.add(rplot.GeomHistogram()) - - @savefig rplot6_tips.png - plot.render(plt.gcf()) - -.. ipython:: python - :suppress: - - plt.close('all') - -In ``seaborn``, this can also be done by only specifying one of the ``row`` -and ``col`` arguments. - -In the example below the colour and shape of the scatter plot graphical -objects is mapped to 'day' and 'size' attributes respectively. You use -scale objects to specify these mappings. The list of scale classes is -given below with initialization arguments for quick reference. - -.. ipython:: python - :okwarning: - - plt.figure() - - plot = rplot.RPlot(tips_data, x='tip', y='total_bill') - plot.add(rplot.TrellisGrid(['sex', 'smoker'])) - plot.add(rplot.GeomPoint(size=80.0, colour=rplot.ScaleRandomColour('day'), shape=rplot.ScaleShape('size'), alpha=1.0)) - - @savefig rplot7_tips.png - plot.render(plt.gcf()) - -.. ipython:: python - :suppress: - - plt.close('all') - -This can also be done in ``seaborn``, at least for 3 variables: - -.. code-block:: python - - g = sns.FacetGrid(tips_data, row="sex", col="smoker", hue="day") - g.map(plt.scatter, "tip", "total_bill") - g.add_legend() - -.. image:: _static/rplot-seaborn-example6.png + The ``rplot`` trellis plotting interface has been **removed**. 
Please use + external packages like `seaborn `_ for + similar but more refined functionality and refer to our 0.18.1 documentation + `here `__ + for how to convert to using it. diff --git a/doc/source/whatsnew.rst b/doc/source/whatsnew.rst index 685f1d2086c69..77dc249aeb788 100644 --- a/doc/source/whatsnew.rst +++ b/doc/source/whatsnew.rst @@ -18,7 +18,7 @@ What's New These are new features and improvements of note in each release. -.. include:: whatsnew/v0.18.2.txt +.. include:: whatsnew/v0.19.0.txt .. include:: whatsnew/v0.18.1.txt diff --git a/doc/source/whatsnew/v0.10.0.txt b/doc/source/whatsnew/v0.10.0.txt index ce20de654ffd8..fed3ba3ce3a84 100644 --- a/doc/source/whatsnew/v0.10.0.txt +++ b/doc/source/whatsnew/v0.10.0.txt @@ -383,6 +383,7 @@ Adding experimental support for Panel4D and factory functions to create n-dimens :ref:`Docs ` for NDim. Here is a taste of what to expect. .. ipython:: python + :okwarning: p4d = Panel4D(randn(2, 2, 5, 4), labels=['Label1','Label2'], diff --git a/doc/source/whatsnew/v0.13.0.txt b/doc/source/whatsnew/v0.13.0.txt index e8f2f54b873d6..0944d849cfafd 100644 --- a/doc/source/whatsnew/v0.13.0.txt +++ b/doc/source/whatsnew/v0.13.0.txt @@ -825,7 +825,7 @@ Experimental # Your Google BigQuery Project ID # To find this, see your dashboard: - # https://code.google.com/apis/console/b/0/?noredirect + # https://console.developers.google.com/iam-admin/projects?authuser=0 projectid = xxxxxxxxx; df = gbq.read_gbq(query, project_id = projectid) diff --git a/doc/source/whatsnew/v0.14.1.txt b/doc/source/whatsnew/v0.14.1.txt index 84f2a77203c41..239d6c9c6e0d4 100644 --- a/doc/source/whatsnew/v0.14.1.txt +++ b/doc/source/whatsnew/v0.14.1.txt @@ -156,7 +156,7 @@ Experimental ~~~~~~~~~~~~ - ``pandas.io.data.Options`` has a new method, ``get_all_data`` method, and now consistently returns a - multi-indexed ``DataFrame``, see :ref:`the docs `. (:issue:`5602`) + multi-indexed ``DataFrame`` (:issue:`5602`) - ``io.gbq.read_gbq`` and ``io.gbq.to_gbq`` were refactored to remove the dependency on the Google ``bq.py`` command line client. This submodule now uses ``httplib2`` and the Google ``apiclient`` and ``oauth2client`` API client diff --git a/doc/source/whatsnew/v0.15.1.txt b/doc/source/whatsnew/v0.15.1.txt index 2a4104c2d5dc4..cd9298c74539a 100644 --- a/doc/source/whatsnew/v0.15.1.txt +++ b/doc/source/whatsnew/v0.15.1.txt @@ -144,16 +144,46 @@ API changes Current behavior: - .. ipython:: python - :okwarning: - - from pandas.io.data import Options - aapl = Options('aapl','yahoo') - aapl.get_call_data().iloc[0:5,0:1] - aapl.expiry_dates - aapl.get_near_stock_price(expiry=aapl.expiry_dates[0:3]).iloc[0:5,0:1] + .. 
code-block:: ipython - See the Options documentation in :ref:`Remote Data ` + In [17]: from pandas.io.data import Options + + In [18]: aapl = Options('aapl','yahoo') + + In [19]: aapl.get_call_data().iloc[0:5,0:1] + Out[19]: + Last + Strike Expiry Type Symbol + 80 2014-11-14 call AAPL141114C00080000 29.05 + 84 2014-11-14 call AAPL141114C00084000 24.80 + 85 2014-11-14 call AAPL141114C00085000 24.05 + 86 2014-11-14 call AAPL141114C00086000 22.76 + 87 2014-11-14 call AAPL141114C00087000 21.74 + + In [20]: aapl.expiry_dates + Out[20]: + [datetime.date(2014, 11, 14), + datetime.date(2014, 11, 22), + datetime.date(2014, 11, 28), + datetime.date(2014, 12, 5), + datetime.date(2014, 12, 12), + datetime.date(2014, 12, 20), + datetime.date(2015, 1, 17), + datetime.date(2015, 2, 20), + datetime.date(2015, 4, 17), + datetime.date(2015, 7, 17), + datetime.date(2016, 1, 15), + datetime.date(2017, 1, 20)] + + In [21]: aapl.get_near_stock_price(expiry=aapl.expiry_dates[0:3]).iloc[0:5,0:1] + Out[21]: + Last + Strike Expiry Type Symbol + 109 2014-11-22 call AAPL141122C00109000 1.48 + 2014-11-28 call AAPL141128C00109000 1.79 + 110 2014-11-14 call AAPL141114C00110000 0.55 + 2014-11-22 call AAPL141122C00110000 1.02 + 2014-11-28 call AAPL141128C00110000 1.32 .. _whatsnew_0151.datetime64_plotting: @@ -225,7 +255,7 @@ Enhancements - Added support for 3-character ISO and non-standard country codes in :func:`io.wb.download()` (:issue:`8482`) -- :ref:`World Bank data requests ` now will warn/raise based +- World Bank data requests now will warn/raise based on an ``errors`` argument, as well as a list of hard-coded country codes and the World Bank's JSON response. In prior versions, the error messages didn't look at the World Bank's JSON response. Problem-inducing input were diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt index 68a558a2b7fd0..4255f4839bca0 100644 --- a/doc/source/whatsnew/v0.16.0.txt +++ b/doc/source/whatsnew/v0.16.0.txt @@ -19,7 +19,7 @@ Highlights include: modules are deprecated. We refer users to external packages like `seaborn `_, `pandas-qt `_ and - `rpy2 `_ for similar or equivalent + `rpy2 `_ for similar or equivalent functionality, see :ref:`here ` Check the :ref:`API Changes ` and :ref:`deprecations ` before updating. @@ -508,7 +508,7 @@ Deprecations We refer users to the external package `pandas-qt `_. (:issue:`9615`) - The ``pandas.rpy`` interface is deprecated and will be removed in a future version. - Similar functionaility can be accessed thru the `rpy2 `_ project (:issue:`9602`) + Similar functionaility can be accessed thru the `rpy2 `_ project (:issue:`9602`) - Adding ``DatetimeIndex/PeriodIndex`` to another ``DatetimeIndex/PeriodIndex`` is being deprecated as a set-operation. This will be changed to a ``TypeError`` in a future version. ``.union()`` should be used for the union set operation. (:issue:`9094`) - Subtracting ``DatetimeIndex/PeriodIndex`` from another ``DatetimeIndex/PeriodIndex`` is being deprecated as a set-operation. This will be changed to an actual numeric subtraction yielding a ``TimeDeltaIndex`` in a future version. ``.difference()`` should be used for the differencing set operation. 
(:issue:`9094`) diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index ef9785d25f014..fc13224d3fe6e 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -965,7 +965,7 @@ Deprecations - ``TimeSeries`` deprecated in favor of ``Series`` (note that this has been an alias since 0.13.0), (:issue:`10890`) - ``SparsePanel`` deprecated and will be removed in a future version (:issue:`11157`). - ``Series.is_time_series`` deprecated in favor of ``Series.index.is_all_dates`` (:issue:`11135`) -- Legacy offsets (like ``'A@JAN'``) listed in :ref:`here ` are deprecated (note that this has been alias since 0.8.0), (:issue:`10878`) +- Legacy offsets (like ``'A@JAN'``) are deprecated (note that this has been alias since 0.8.0) (:issue:`10878`) - ``WidePanel`` deprecated in favor of ``Panel``, ``LongPanel`` in favor of ``DataFrame`` (note these have been aliases since < 0.11.0), (:issue:`10892`) - ``DataFrame.convert_objects`` has been deprecated in favor of type-specific functions ``pd.to_datetime``, ``pd.to_timestamp`` and ``pd.to_numeric`` (new in 0.17.0) (:issue:`11133`). diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt index 93c76bc80684f..7418cd0e6baa3 100644 --- a/doc/source/whatsnew/v0.18.0.txt +++ b/doc/source/whatsnew/v0.18.0.txt @@ -764,7 +764,7 @@ yields a ``Resampler``. r Downsampling -'''''''''''' +"""""""""""" You can then use this object to perform operations. These are downsampling operations (going from a higher frequency to a lower one). @@ -796,7 +796,7 @@ These accessors can of course, be combined r[['A','B']].agg(['mean','sum']) Upsampling -'''''''''' +"""""""""" .. currentmodule:: pandas.tseries.resample @@ -842,7 +842,7 @@ New API In the new API, you can either downsample OR upsample. The prior implementation would allow you to pass an aggregator function (like ``mean``) even though you were upsampling, providing a bit of confusion. Previous API will work but with deprecations -'''''''''''''''''''''''''''''''''''''''''''' +"""""""""""""""""""""""""""""""""""""""""""" .. warning:: diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index 51982c42499ff..7f74d8a769e4b 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -374,7 +374,7 @@ New Behavior: df.groupby('c', sort=False).nth(1) -.. _whatsnew_0181.numpy_compatibility +.. _whatsnew_0181.numpy_compatibility: numpy function compatibility ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -658,7 +658,7 @@ Bug Fixes - Bug in ``CategoricalIndex.get_loc`` returns different result from regular ``Index`` (:issue:`12531`) - Bug in ``PeriodIndex.resample`` where name not propagated (:issue:`12769`) - +- Bug in ``date_range`` ``closed`` keyword and timezones (:issue:`12684`). - Bug in ``pd.concat`` raises ``AttributeError`` when input data contains tz-aware datetime and timedelta (:issue:`12620`) - Bug in ``pd.concat`` did not handle empty ``Series`` properly (:issue:`11082`) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt deleted file mode 100644 index 950bf397f43b5..0000000000000 --- a/doc/source/whatsnew/v0.18.2.txt +++ /dev/null @@ -1,376 +0,0 @@ -.. _whatsnew_0182: - -v0.18.2 (July ??, 2016) ------------------------ - -This is a minor bug-fix release from 0.18.1 and includes a large number of -bug fixes along with several new features, enhancements, and performance improvements. -We recommend that all users upgrade to this version. - -Highlights include: - - -.. 
contents:: What's new in v0.18.2 - :local: - :backlinks: none - -.. _whatsnew_0182.new_features: - -New features -~~~~~~~~~~~~ - -.. _whatsnew_0182.enhancements.read_csv_dupe_col_names_support: - -``pd.read_csv`` has improved support for duplicate column names -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -:ref:`Duplicate column names ` are now supported in ``pd.read_csv()`` whether -they are in the file or passed in as the ``names`` parameter (:issue:`7160`, :issue:`9424`) - -.. ipython :: python - - data = '0,1,2\n3,4,5' - names = ['a', 'b', 'a'] - -Previous behaviour: - -.. code-block:: ipython - - In [2]: pd.read_csv(StringIO(data), names=names) - Out[2]: - a b a - 0 2 1 2 - 1 5 4 5 - -The first 'a' column contains the same data as the second 'a' column, when it should have -contained the array ``[0, 3]``. - -New behaviour: - -.. ipython :: python - - In [2]: pd.read_csv(StringIO(data), names=names) - -.. _whatsnew_0182.enhancements.other: - -Other enhancements -^^^^^^^^^^^^^^^^^^ - -- The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. The default behaviour remains to raising a ``NonExistentTimeError`` (:issue:`13057`) - -- ``Index`` now supports ``.str.extractall()`` which returns ``DataFrame``, see :ref:`Extract all matches in each subject (extractall) ` (:issue:`10008`, :issue:`13156`) -- ``.to_hdf/read_hdf()`` now accept path objects (e.g. ``pathlib.Path``, ``py.path.local``) for the file path (:issue:`11773`) - - .. ipython:: python - - idx = pd.Index(["a1a2", "b1", "c1"]) - idx.str.extractall("[ab](?P\d)") - -- ``Timestamp`` s can now accept positional and keyword parameters like :func:`datetime.datetime` (:issue:`10758`, :issue:`11630`) - - .. ipython:: python - - pd.Timestamp(2012, 1, 1) - - pd.Timestamp(year=2012, month=1, day=1, hour=8, minute=30) - -- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``decimal`` option (:issue:`12933`) -- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``na_filter`` option (:issue:`13321`) - -- ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`) -- ``Index`` now supports the ``.where()`` function for same shape indexing (:issue:`13170`) - - .. ipython:: python - - idx = pd.Index(['a', 'b', 'c']) - idx.where([True, False, True]) - -- ``Categorical.astype()`` now accepts an optional boolean argument ``copy``, effective when dtype is categorical (:issue:`13209`) -- Consistent with the Python API, ``pd.read_csv()`` will now interpret ``+inf`` as positive infinity (:issue:`13274`) - -- The ``DataFrame`` constructor will now respect key ordering if a list of ``OrderedDict`` objects are passed in (:issue:`13304`) -- ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`) - -- ``eval``'s upcasting rules for ``float32`` types have been updated to be more consistent with NumPy's rules. New behavior will not upcast to ``float64`` if you multiply a pandas ``float32`` object by a scalar float64. (:issue:`12388`) -- ``Series`` has gained the properties ``.is_monotonic``, ``.is_monotonic_increasing``, ``.is_monotonic_decreasing``, similar to ``Index`` (:issue:`13336`) - -.. 
_whatsnew_0182.api: - -API changes -~~~~~~~~~~~ - - -- Non-convertible dates in an excel date column will be returned without conversion and the column will be ``object`` dtype, rather than raising an exception (:issue:`10001`) -- An ``UnsupportedFunctionCall`` error is now raised if numpy ufuncs like ``np.mean`` are called on groupby or resample objects (:issue:`12811`) -- Calls to ``.sample()`` will respect the random seed set via ``numpy.random.seed(n)`` (:issue:`13161`) - -.. _whatsnew_0182.api.tolist: - -``Series.tolist()`` will now return Python types -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -``Series.tolist()`` will now return Python types in the output, mimicking NumPy ``.tolist()`` behaviour (:issue:`10904`) - - -.. ipython:: python - - s = pd.Series([1,2,3]) - type(s.tolist()[0]) - -Previous Behavior: - -.. code-block:: ipython - - In [7]: type(s.tolist()[0]) - Out[7]: - - -New Behavior: - -.. ipython:: python - - type(s.tolist()[0]) - -.. _whatsnew_0182.api.promote: - -``Series`` type promotion on assignment -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -A ``Series`` will now correctly promote its dtype for assignment with incompat values to the current dtype (:issue:`13234`) - - -.. ipython:: python - - s = pd.Series() - -Previous Behavior: - -.. code-block:: ipython - - In [2]: s["a"] = pd.Timestamp("2016-01-01") - - In [3]: s["b"] = 3.0 - TypeError: invalid type promotion - -New Behavior: - -.. ipython:: python - - s["a"] = pd.Timestamp("2016-01-01") - s["b"] = 3.0 - s - s.dtype - -.. _whatsnew_0182.api.to_datetime_coerce: - -``.to_datetime()`` when coercing -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -A bug is fixed in ``.to_datetime()`` when passing integers or floats, and no ``unit`` and ``errors='coerce'`` (:issue:`13180`). -Previously if ``.to_datetime()`` encountered mixed integers/floats and strings, but no datetimes with ``errors='coerce'`` it would convert all to ``NaT``. - -Previous Behavior: - -.. code-block:: ipython - - In [2]: pd.to_datetime([1, 'foo'], errors='coerce') - Out[2]: DatetimeIndex(['NaT', 'NaT'], dtype='datetime64[ns]', freq=None) - -This will now convert integers/floats with the default unit of ``ns``. - -.. ipython:: python - - pd.to_datetime([1, 'foo'], errors='coerce') - -.. _whatsnew_0182.api.merging: - -Merging changes -^^^^^^^^^^^^^^^ - -Merging will now preserve the dtype of the join keys (:issue:`8596`) - -.. ipython:: python - - df1 = pd.DataFrame({'key': [1], 'v1': [10]}) - df1 - df2 = pd.DataFrame({'key': [1, 2], 'v1': [20, 30]}) - df2 - -Previous Behavior: - -.. code-block:: ipython - - In [5]: pd.merge(df1, df2, how='outer') - Out[5]: - key v1 - 0 1.0 10.0 - 1 1.0 20.0 - 2 2.0 30.0 - - In [6]: pd.merge(df1, df2, how='outer').dtypes - Out[6]: - key float64 - v1 float64 - dtype: object - -New Behavior: - -We are able to preserve the join keys - -.. ipython:: python - - pd.merge(df1, df2, how='outer') - pd.merge(df1, df2, how='outer').dtypes - -Of course if you have missing values that are introduced, then the -resulting dtype will be upcast (unchanged from previous). - -.. ipython:: python - - pd.merge(df1, df2, how='outer', on='key') - pd.merge(df1, df2, how='outer', on='key').dtypes - -.. _whatsnew_0182.describe: - -``.describe()`` changes -^^^^^^^^^^^^^^^^^^^^^^^ - -Percentile identifiers in the index of a ``.describe()`` output will now be rounded to the least precision that keeps them distinct (:issue:`13104`) - -.. 
ipython:: python - - s = pd.Series([0, 1, 2, 3, 4]) - df = pd.DataFrame([0, 1, 2, 3, 4]) - -Previous Behavior: - -The percentiles were rounded to at most one decimal place, which could raise ``ValueError`` for a data frame if the percentiles were duplicated. - -.. code-block:: ipython - - In [3]: s.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999, 0.9995, 0.9999]) - Out[3]: - count 5.000000 - mean 2.000000 - std 1.581139 - min 0.000000 - 0.0% 0.000400 - 0.1% 0.002000 - 0.1% 0.004000 - 50% 2.000000 - 99.9% 3.996000 - 100.0% 3.998000 - 100.0% 3.999600 - max 4.000000 - dtype: float64 - - In [4]: df.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999, 0.9995, 0.9999]) - Out[4]: - ... - ValueError: cannot reindex from a duplicate axis - -New Behavior: - -.. ipython:: python - - s.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999, 0.9995, 0.9999]) - df.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999, 0.9995, 0.9999]) - -- Passing duplicated ``percentiles`` will now raise a ``ValueError``. -- Bug in ``.describe()`` on a DataFrame with a mixed-dtype column index, which would previously raise a ``TypeError`` (:issue:`13288`) - -.. _whatsnew_0182.api.other: - -Other API changes -^^^^^^^^^^^^^^^^^ - -- ``Float64Index.astype(int)`` will now raise ``ValueError`` if ``Float64Index`` contains ``NaN`` values (:issue:`13149`) -- ``TimedeltaIndex.astype(int)`` and ``DatetimeIndex.astype(int)`` will now return ``Int64Index`` instead of ``np.array`` (:issue:`13209`) - -.. _whatsnew_0182.deprecations: - -Deprecations -^^^^^^^^^^^^ - - -.. _whatsnew_0182.performance: - -Performance Improvements -~~~~~~~~~~~~~~~~~~~~~~~~ - -- Improved performance of sparse ``IntIndex.intersect`` (:issue:`13082`) -- Improved performance of sparse arithmetic with ``BlockIndex`` when the number of blocks are large, though recommended to use ``IntIndex`` in such cases (:issue:`13082`) -- increased performance of ``DataFrame.quantile()`` as it now operates per-block (:issue:`11623`) - - -- Improved performance of ``DataFrameGroupBy.transform`` (:issue:`12737`) - - -.. 
_whatsnew_0182.bug_fixes: - -Bug Fixes -~~~~~~~~~ - -- Bug in ``io.json.json_normalize()``, where non-ascii keys raised an exception (:issue:`13213`) -- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing may raise ``IndexError`` (:issue:`13144`) -- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing result may have normal ``Index`` (:issue:`13144`) -- Bug in ``SparseDataFrame`` in which ``axis=None`` did not default to ``axis=0`` (:issue:`13048`) -- Bug in ``SparseSeries`` and ``SparseDataFrame`` creation with ``object`` dtype may raise ``TypeError`` (:issue:`11633`) -- Bug when passing a not-default-indexed ``Series`` as ``xerr`` or ``yerr`` in ``.plot()`` (:issue:`11858`) -- Bug in matplotlib ``AutoDataFormatter``; this restores the second scaled formatting and re-adds micro-second scaled formatting (:issue:`13131`) -- Bug in selection from a ``HDFStore`` with a fixed format and ``start`` and/or ``stop`` specified will now return the selected range (:issue:`8287`) - - -- Bug in ``.groupby(..).resample(..)`` when the same object is called multiple times (:issue:`13174`) -- Bug in ``.to_records()`` when index name is a unicode string (:issue:`13172`) - -- Bug in calling ``.memory_usage()`` on object which doesn't implement (:issue:`12924`) - -- Regression in ``Series.quantile`` with nans (also shows up in ``.median()`` and ``.describe()``); furthermore now names the ``Series`` with the quantile (:issue:`13098`, :issue:`13146`) - -- Bug in ``SeriesGroupBy.transform`` with datetime values and missing groups (:issue:`13191`) - -- Bug in ``Series.str.extractall()`` with ``str`` index raises ``ValueError`` (:issue:`13156`) - - -- Bug in ``PeriodIndex`` and ``Period`` subtraction raises ``AttributeError`` (:issue:`13071`) -- Bug in ``PeriodIndex`` construction returning a ``float64`` index in some circumstances (:issue:`13067`) -- Bug in ``.resample(..)`` with a ``PeriodIndex`` not changing its ``freq`` appropriately when empty (:issue:`13067`) -- Bug in ``.resample(..)`` with a ``PeriodIndex`` not retaining its type or name with an empty ``DataFrame``appropriately when empty (:issue:`13212`) -- Bug in ``groupby(..).resample(..)`` where passing some keywords would raise an exception (:issue:`13235`) -- Bug in ``.tz_convert`` on a tz-aware ``DateTimeIndex`` that relied on index being sorted for correct results (:issue: `13306`) - - - -- Bug in ``MultiIndex`` slicing where extra elements were returned when level is non-unique (:issue:`12896`) - - - -- Bug in ``pd.read_csv()`` with ``engine='python'`` in which ``NaN`` values weren't being detected after data was converted to numeric values (:issue:`13314`) -- Bug in ``pd.read_csv()`` in which the ``nrows`` argument was not properly validated for both engines (:issue:`10476`) -- Bug in ``pd.read_csv()`` with ``engine='python'`` in which infinities of mixed-case forms were not being interpreted properly (:issue:`13274`) -- Bug in ``pd.read_csv()`` with ``engine='python'`` in which trailing ``NaN`` values were not being parsed (:issue:`13320`) -- Bug in ``pd.read_csv()`` that prevents ``usecols`` kwarg from accepting single-byte unicode strings (:issue:`13219`) - - - -- Bug in ``Series`` arithmetic raises ``TypeError`` if it contains datetime-like as ``object`` dtype (:issue:`13043`) - - -- Bug in ``pd.to_datetime()`` when passing invalid datatypes (e.g. 
bool); will now respect the ``errors`` keyword (:issue:`13176`) -- Bug in extension dtype creation where the created types were not is/identical (:issue:`13285`) - -- Bug in ``NaT`` - ``Period`` raises ``AttributeError`` (:issue:`13071`) -- Bug in ``Period`` addition raises ``TypeError`` if ``Period`` is on right hand side (:issue:`13069`) -- Bug in ``Peirod`` and ``Series`` or ``Index`` comparison raises ``TypeError`` (:issue:`13200`) -- Bug in ``pd.set_eng_float_format()`` that would prevent NaN's from formatting (:issue:`11981`) -- Bug in ``.unstack`` with ``Categorical`` dtype resets ``.ordered`` to ``True`` (:issue:`13249`) - - -- Bug in ``groupby`` where ``apply`` returns different result depending on whether first result is ``None`` or not (:issue:`12824`) - - -- Bug in ``pd.to_numeric`` when ``errors='coerce'`` and input contains non-hashable objects (:issue:`13324`) - - -- Bug in ``Categorical.remove_unused_categories()`` changes ``.codes`` dtype to platform int (:issue:`13261`) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 42db0388ca5d9..a3e8f0c314352 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -1,71 +1,1385 @@ .. _whatsnew_0190: -v0.19.0 (????, 2016) --------------------- +v0.19.0 (September ??, 2016) +---------------------------- -This is a major release from 0.18.2 and includes a small number of API changes, several new features, +This is a major release from 0.18.1 and includes number of API changes, several new features, enhancements, and performance improvements along with a large number of bug fixes. We recommend that all users upgrade to this version. Highlights include: +- :func:`merge_asof` for asof-style time-series joining, see :ref:`here ` +- ``.rolling()`` are now time-series aware, see :ref:`here ` +- :func:`read_csv` now supports parsing ``Categorical`` data, see :ref:`here ` +- A function :func:`union_categorical` has been added for combining categoricals, see :ref:`here ` +- ``PeriodIndex`` now has its own ``period`` dtype, and changed to be more consistent with other ``Index`` classes. See :ref:`here ` +- Sparse data structures gained enhanced support of ``int`` and ``bool`` dtypes, see :ref:`here ` +- Comparison operations with ``Series`` no longer ignores the index, see :ref:`here ` for an overview of the API changes. +- Introduction of a pandas development API for utility functions, see :ref:`here `. +- Deprecation of ``Panel4D`` and ``PanelND``. We recommend to represent these types of n-dimensional data with the `xarray package `__. +- Removal of the previously deprecated modules ``pandas.io.data``, ``pandas.io.wb``, ``pandas.tools.rplot``. -Check the :ref:`API Changes ` and :ref:`deprecations ` before updating. +.. warning:: + + pandas >= 0.19.0 will no longer silence numpy ufunc warnings upon import, see :ref:`here `. .. contents:: What's new in v0.19.0 :local: :backlinks: none -.. _whatsnew_0190.enhancements: +.. _whatsnew_0190.new_features: New features ~~~~~~~~~~~~ +.. _whatsnew_0190.dev_api: + +pandas development API +^^^^^^^^^^^^^^^^^^^^^^ + +As part of making pandas API more uniform and accessible in the future, we have created a standard +sub-package of pandas, ``pandas.api`` to hold public API's. We are starting by exposing type +introspection functions in ``pandas.api.types``. More sub-packages and officially sanctioned API's +will be published in future versions of pandas (:issue:`13147`, :issue:`13634`) + +The following are now part of this API: + +.. 
ipython:: python + + import pprint + from pandas.api import types + funcs = [ f for f in dir(types) if not f.startswith('_') ] + pprint.pprint(funcs) + +.. note:: + + Calling these functions from the internal module ``pandas.core.common`` will now show a ``DeprecationWarning`` (:issue:`13990`) + +.. _whatsnew_0190.enhancements.asof_merge: + +``merge_asof`` for asof-style time-series joining +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +A long-time requested feature has been added through the :func:`merge_asof` function, to +support asof style joining of time-series. (:issue:`1870`, :issue:`13695`, :issue:`13709`, :issue:`13902`). Full documentation is +:ref:`here ` + +The :func:`merge_asof` performs an asof merge, which is similar to a left-join +except that we match on nearest key rather than equal keys. + +.. ipython:: python + + left = pd.DataFrame({'a': [1, 5, 10], + 'left_val': ['a', 'b', 'c']}) + right = pd.DataFrame({'a': [1, 2, 3, 6, 7], + 'right_val': [1, 2, 3, 6, 7]}) + + left + right + +We typically want to match exactly when possible, and use the most +recent value otherwise. + +.. ipython:: python + + pd.merge_asof(left, right, on='a') + +We can also match rows ONLY with prior data, and not an exact match. + +.. ipython:: python + + pd.merge_asof(left, right, on='a', allow_exact_matches=False) + + +In a typical time-series example, we have ``trades`` and ``quotes`` and we want to ``asof-join`` them. +This also illustrates using the ``by`` parameter to group data before merging. + +.. ipython:: python + + trades = pd.DataFrame({ + 'time': pd.to_datetime(['20160525 13:30:00.023', + '20160525 13:30:00.038', + '20160525 13:30:00.048', + '20160525 13:30:00.048', + '20160525 13:30:00.048']), + 'ticker': ['MSFT', 'MSFT', + 'GOOG', 'GOOG', 'AAPL'], + 'price': [51.95, 51.95, + 720.77, 720.92, 98.00], + 'quantity': [75, 155, + 100, 100, 100]}, + columns=['time', 'ticker', 'price', 'quantity']) + + quotes = pd.DataFrame({ + 'time': pd.to_datetime(['20160525 13:30:00.023', + '20160525 13:30:00.023', + '20160525 13:30:00.030', + '20160525 13:30:00.041', + '20160525 13:30:00.048', + '20160525 13:30:00.049', + '20160525 13:30:00.072', + '20160525 13:30:00.075']), + 'ticker': ['GOOG', 'MSFT', 'MSFT', + 'MSFT', 'GOOG', 'AAPL', 'GOOG', + 'MSFT'], + 'bid': [720.50, 51.95, 51.97, 51.99, + 720.50, 97.99, 720.50, 52.01], + 'ask': [720.93, 51.96, 51.98, 52.00, + 720.93, 98.01, 720.88, 52.03]}, + columns=['time', 'ticker', 'bid', 'ask']) + +.. ipython:: python + + trades + quotes + +An asof merge joins on the ``on``, typically a datetimelike field, which is ordered, and +in this case we are using a grouper in the ``by`` field. This is like a left-outer join, except +that forward filling happens automatically taking the most recent non-NaN value. + +.. ipython:: python + + pd.merge_asof(trades, quotes, + on='time', + by='ticker') + +This returns a merged DataFrame with the entries in the same order as the original left +passed DataFrame (``trades`` in this case), with the fields of the ``quotes`` merged. + +.. _whatsnew_0190.enhancements.rolling_ts: + +``.rolling()`` are now time-series aware +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``.rolling()`` objects are now time-series aware and can accept a time-series offset (or convertible) for the ``window`` argument (:issue:`13327`, :issue:`12995`) +See the full documentation :ref:`here `. + +.. 
ipython:: python + + dft = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}, + index=pd.date_range('20130101 09:00:00', periods=5, freq='s')) + dft + +This is a regular frequency index. Using an integer window parameter works to roll along the window frequency. + +.. ipython:: python + + dft.rolling(2).sum() + dft.rolling(2, min_periods=1).sum() + +Specifying an offset allows a more intuitive specification of the rolling frequency. + +.. ipython:: python + + dft.rolling('2s').sum() + +Using a non-regular, but still monotonic index, rolling with an integer window does not impart any special calculation. + +.. ipython:: python + + + dft = DataFrame({'B': [0, 1, 2, np.nan, 4]}, + index = pd.Index([pd.Timestamp('20130101 09:00:00'), + pd.Timestamp('20130101 09:00:02'), + pd.Timestamp('20130101 09:00:03'), + pd.Timestamp('20130101 09:00:05'), + pd.Timestamp('20130101 09:00:06')], + name='foo')) + + dft + dft.rolling(2).sum() + +Using the time-specification generates variable windows for this sparse data. + +.. ipython:: python + + dft.rolling('2s').sum() + +Furthermore, we now allow an optional ``on`` parameter to specify a column (rather than the +default of the index) in a DataFrame. + +.. ipython:: python + + dft = dft.reset_index() + dft + dft.rolling('2s', on='foo').sum() + +.. _whatsnew_0190.enhancements.read_csv_dupe_col_names_support: + +``read_csv`` has improved support for duplicate column names +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. ipython:: python + :suppress: + + from pandas.compat import StringIO + +:ref:`Duplicate column names ` are now supported in :func:`read_csv` whether +they are in the file or passed in as the ``names`` parameter (:issue:`7160`, :issue:`9424`) + +.. ipython:: python + + data = '0,1,2\n3,4,5' + names = ['a', 'b', 'a'] + +**Previous behavior**: + +.. code-block:: ipython + + In [2]: pd.read_csv(StringIO(data), names=names) + Out[2]: + a b a + 0 2 1 2 + 1 5 4 5 + +The first ``a`` column contained the same data as the second ``a`` column, when it should have +contained the values ``[0, 3]``. + +**New behavior**: + +.. ipython:: python + + pd.read_csv(StringIO(data), names=names) + + +.. _whatsnew_0190.enhancements.read_csv_categorical: + +``read_csv`` supports parsing ``Categorical`` directly +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The :func:`read_csv` function now supports parsing a ``Categorical`` column when +specified as a dtype (:issue:`10153`). Depending on the structure of the data, +this can result in a faster parse time and lower memory usage compared to +converting to ``Categorical`` after parsing. See the io :ref:`docs here `. + +.. ipython:: python + + data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3' + + pd.read_csv(StringIO(data)) + pd.read_csv(StringIO(data)).dtypes + pd.read_csv(StringIO(data), dtype='category').dtypes + +Individual columns can be parsed as a ``Categorical`` using a dict specification + +.. ipython:: python + + pd.read_csv(StringIO(data), dtype={'col1': 'category'}).dtypes + +.. note:: + + The resulting categories will always be parsed as strings (object dtype). + If the categories are numeric they can be converted using the + :func:`to_numeric` function, or as appropriate, another converter + such as :func:`to_datetime`. + + .. ipython:: python + + df = pd.read_csv(StringIO(data), dtype='category') + df.dtypes + df['col3'] + df['col3'].cat.categories = pd.to_numeric(df['col3'].cat.categories) + df['col3'] + +.. 
_whatsnew_0190.enhancements.union_categoricals: + +Categorical Concatenation +^^^^^^^^^^^^^^^^^^^^^^^^^ + +- A function :func:`union_categoricals` has been added for combining categoricals, see :ref:`Unioning Categoricals` (:issue:`13361`, :issue:`:13763`, issue:`13846`) + +.. ipython:: python + + from pandas.types.concat import union_categoricals + a = pd.Categorical(["b", "c"]) + b = pd.Categorical(["a", "b"]) + union_categoricals([a, b]) + +- ``concat`` and ``append`` now can concat ``category`` dtypes with different ``categories`` as ``object`` dtype (:issue:`13524`) + +**Previous behavior**: + + .. code-block:: ipython + + In [1]: s1 = pd.Series(['a', 'b'], dtype='category') + In [2]: s2 = pd.Series(['b', 'c'], dtype='category') + In [3]: pd.concat([s1, s2]) + ValueError: incompatible categories in categorical concat + +**New behavior**: + + .. ipython:: python + + pd.concat([s1, s2]) + +.. _whatsnew_0190.enhancements.semi_month_offsets: + +Semi-Month Offsets +^^^^^^^^^^^^^^^^^^ + +Pandas has gained new frequency offsets, ``SemiMonthEnd`` ('SM') and ``SemiMonthBegin`` ('SMS'). +These provide date offsets anchored (by default) to the 15th and end of month, and 15th and 1st of month respectively. +(:issue:`1543`) + +.. ipython:: python + + from pandas.tseries.offsets import SemiMonthEnd, SemiMonthBegin + +SemiMonthEnd: + +.. ipython:: python + + Timestamp('2016-01-01') + SemiMonthEnd() + + pd.date_range('2015-01-01', freq='SM', periods=4) + +SemiMonthBegin: + +.. ipython:: python + + Timestamp('2016-01-01') + SemiMonthBegin() + + pd.date_range('2015-01-01', freq='SMS', periods=4) + +Using the anchoring suffix, you can also specify the day of month to use instead of the 15th. + +.. ipython:: python + + pd.date_range('2015-01-01', freq='SMS-16', periods=4) + + pd.date_range('2015-01-01', freq='SM-14', periods=4) + +.. _whatsnew_0190.enhancements.index: + +New Index methods +^^^^^^^^^^^^^^^^^ + +The following methods and options are added to ``Index``, to be more consistent with the ``Series`` and ``DataFrame`` API. + +``Index`` now supports the ``.where()`` function for same shape indexing (:issue:`13170`) + +.. ipython:: python + idx = pd.Index(['a', 'b', 'c']) + idx.where([True, False, True]) +``Index`` now supports ``.dropna()`` to exclude missing values (:issue:`6194`) + +.. ipython:: python + + idx = pd.Index([1, 2, np.nan, 4]) + idx.dropna() + +For ``MultiIndex``, values are dropped if any level is missing by default. Specifying +``how='all'`` only drops values where all levels are missing. + +.. ipython:: python + + midx = pd.MultiIndex.from_arrays([[1, 2, np.nan, 4], + [1, 2, np.nan, np.nan]]) + midx + midx.dropna() + midx.dropna(how='all') + +``Index`` now supports ``.str.extractall()`` which returns a ``DataFrame``, see the :ref:`docs here ` (:issue:`10008`, :issue:`13156`) + +.. ipython:: python + + idx = pd.Index(["a1a2", "b1", "c1"]) + idx.str.extractall("[ab](?P\d)") + +``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`) + +.. _whatsnew_0190.gbq: + +Google BigQuery Enhancements +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +The :func:`pandas.io.gbq.read_gbq` method has gained the ``dialect`` argument to allow users to specify whether to use BigQuery's legacy SQL or BigQuery's standard SQL. See the :ref:`docs ` for more details (:issue:`13615`). + +.. 
_whatsnew_0190.errstate: + +Fine-grained numpy errstate +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Previous versions of pandas would permanently silence numpy's ufunc error handling when ``pandas`` was imported. Pandas did this in order to silence the warnings that would arise from using numpy ufuncs on missing data, which are usually represented as ``NaN`` s. Unfortunately, this silenced legitimate warnings arising in non-pandas code in the application. Starting with 0.19.0, pandas will use the ``numpy.errstate`` context manager to silence these warnings in a more fine-grained manner, only around where these operations are actually used in the pandas codebase. (:issue:`13109`, :issue:`13145`) + +After upgrading pandas, you may see *new* ``RuntimeWarnings`` being issued from your code. These are likely legitimate, and the underlying cause likely existed in the code when using previous versions of pandas that simply silenced the warning. Use `numpy.errstate `__ around the source of the ``RuntimeWarning`` to control how these conditions are handled. + +.. _whatsnew_0190.get_dummies_dtypes: + +``get_dummies`` now returns integer dtypes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The ``pd.get_dummies`` function now returns dummy-encoded columns as small integers, rather than floats (:issue:`8725`). This should provide an improved memory footprint. + +**Previous behavior**: + +.. code-block:: ipython + + In [1]: pd.get_dummies(['a', 'b', 'a', 'c']).dtypes + + Out[1]: + a float64 + b float64 + c float64 + dtype: object + +**New behavior**: + +.. ipython:: python + + pd.get_dummies(['a', 'b', 'a', 'c']).dtypes + + +.. _whatsnew_0190.enhancements.to_numeric_downcast: + +Downcast values to smallest possible dtype in ``to_numeric`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``pd.to_numeric()`` now accepts a ``downcast`` parameter, which will downcast the data if possible to smallest specified numerical dtype (:issue:`13352`) + + .. ipython:: python + + s = ['1', 2, 3] + pd.to_numeric(s, downcast='unsigned') + pd.to_numeric(s, downcast='integer') + .. _whatsnew_0190.enhancements.other: Other enhancements ^^^^^^^^^^^^^^^^^^ +- The ``.get_credentials()`` method of ``GbqConnector`` can now first try to fetch `the application default credentials `__. See the :ref:`docs ` for more details (:issue:`13577`). + +- The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. The default behavior remains to raising a ``NonExistentTimeError`` (:issue:`13057`) + +- ``.to_hdf/read_hdf()`` now accept path objects (e.g. ``pathlib.Path``, ``py.path.local``) for the file path (:issue:`11773`) + +- ``Timestamp`` can now accept positional and keyword parameters similar to :func:`datetime.datetime` (:issue:`10758`, :issue:`11630`) + + .. ipython:: python + + pd.Timestamp(2012, 1, 1) + + pd.Timestamp(year=2012, month=1, day=1, hour=8, minute=30) +- the ``.resample()`` function now accepts a ``on=`` or ``level=`` parameter for resampling on a datetimelike column or ``MultiIndex`` level (:issue:`13500`) + .. 
ipython:: python + df = pd.DataFrame({'date': pd.date_range('2015-01-01', freq='W', periods=5), + 'a': np.arange(5)}, + index=pd.MultiIndex.from_arrays([ + [1,2,3,4,5], + pd.date_range('2015-01-01', freq='W', periods=5)], + names=['v','d'])) + df + df.resample('M', on='date').sum() + df.resample('M', level='d').sum() +- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the + ``decimal`` (:issue:`12933`), ``na_filter`` (:issue:`13321`) and the ``memory_map`` option (:issue:`13381`). +- Consistent with the Python API, ``pd.read_csv()`` will now interpret ``+inf`` as positive infinity (:issue:`13274`) +- The ``pd.read_html()`` has gained support for the ``na_values``, ``converters``, ``keep_default_na`` options (:issue:`13461`) +- ``Categorical.astype()`` now accepts an optional boolean argument ``copy``, effective when dtype is categorical (:issue:`13209`) +- ``DataFrame`` has gained the ``.asof()`` method to return the last non-NaN values according to the selected subset (:issue:`13358`) +- The ``DataFrame`` constructor will now respect key ordering if a list of ``OrderedDict`` objects are passed in (:issue:`13304`) +- ``pd.read_html()`` has gained support for the ``decimal`` option (:issue:`12907`) +- ``Series`` has gained the properties ``.is_monotonic``, ``.is_monotonic_increasing``, ``.is_monotonic_decreasing``, similar to ``Index`` (:issue:`13336`) +- ``DataFrame.to_sql()`` now allows a single value as the SQL type for all columns (:issue:`11886`). +- ``Series.append`` now supports the ``ignore_index`` option (:issue:`13677`) +- ``.to_stata()`` and ``StataWriter`` can now write variable labels to Stata dta files using a dictionary to make column names to labels (:issue:`13535`, :issue:`13536`) +- ``.to_stata()`` and ``StataWriter`` will automatically convert ``datetime64[ns]`` columns to Stata format ``%tc``, rather than raising a ``ValueError`` (:issue:`12259`) +- ``read_stata()`` and ``StataReader`` raise with a more explicit error message when reading Stata files with repeated value labels when ``convert_categoricals=True`` (:issue:`13923`) +- ``DataFrame.style`` will now render sparsified MultiIndexes (:issue:`11655`) +- ``DataFrame.style`` will now show column level names (e.g. ``DataFrame.columns.names``) (:issue:`13775`) +- ``DataFrame`` has gained support to re-order the columns based on the values + in a row using ``df.sort_values(by='...', axis=1)`` (:issue:`10806`) -.. _whatsnew_0190.api_breaking: + .. ipython:: python -Backwards incompatible API changes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + df = pd.DataFrame({'A': [2, 7], 'B': [3, 5], 'C': [4, 8]}, + index=['row1', 'row2']) + df + df.sort_values(by='row2', axis=1) + +- Added documentation to :ref:`I/O` regarding the perils of reading in columns with mixed dtypes and how to handle it (:issue:`13746`) +- :meth:`~DataFrame.to_html` now has a ``border`` argument to control the value in the opening ``
`` tag. The default is the value of the ``html.border`` option, which defaults to 1. This also affects the notebook HTML repr, but since Jupyter's CSS includes a border-width attribute, the visual effect is the same. (:issue:`11563`). +- Raise ``ImportError`` in the sql functions when ``sqlalchemy`` is not installed and a connection string is used (:issue:`11920`). +- Compatibility with matplotlib 2.0. Older versions of pandas should also work with matplotlib 2.0 (:issue:`13333`) +- ``Timestamp``, ``Period``, ``DatetimeIndex``, ``PeriodIndex`` and ``.dt`` accessor have gained a ``.is_leap_year`` property to check whether the date belongs to a leap year. (:issue:`13727`) +- ``astype()`` will now accept a dict of column name to data types mapping as the ``dtype`` argument. (:issue:`12086`) +- The ``pd.read_json`` and ``DataFrame.to_json`` has gained support for reading and writing json lines with ``lines`` option see :ref:`Line delimited json ` (:issue:`9180`) .. _whatsnew_0190.api: +API changes +~~~~~~~~~~~ + +``Series.tolist()`` will now return Python types +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``Series.tolist()`` will now return Python types in the output, mimicking NumPy ``.tolist()`` behavior (:issue:`10904`) + + +.. ipython:: python + + s = pd.Series([1,2,3]) + +**Previous behavior**: + +.. code-block:: ipython + + In [7]: type(s.tolist()[0]) + Out[7]: + + +**New behavior**: + +.. ipython:: python + + type(s.tolist()[0]) + +.. _whatsnew_0190.api.series_ops: + +``Series`` operators for different indexes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Following ``Series`` operators have been changed to make all operators consistent, +including ``DataFrame`` (:issue:`1134`, :issue:`4581`, :issue:`13538`) + +- ``Series`` comparison operators now raise ``ValueError`` when ``index`` are different. +- ``Series`` logical operators align both ``index`` of left and right hand side. + +.. warning:: + Until 0.18.1, comparing ``Series`` with the same length, would succeed even if + the ``.index`` are different (the result ignores ``.index``). As of 0.19.0, this will raises ``ValueError`` to be more strict. This section also describes how to keep previous behavior or align different indexes, using the flexible comparison methods like ``.eq``. + + +As a result, ``Series`` and ``DataFrame`` operators behave as below: + +Arithmetic operators +"""""""""""""""""""" + +Arithmetic operators align both ``index`` (no changes). + +.. ipython:: python + + s1 = pd.Series([1, 2, 3], index=list('ABC')) + s2 = pd.Series([2, 2, 2], index=list('ABD')) + s1 + s2 + + df1 = pd.DataFrame([1, 2, 3], index=list('ABC')) + df2 = pd.DataFrame([2, 2, 2], index=list('ABD')) + df1 + df2 + +Comparison operators +"""""""""""""""""""" + +Comparison operators raise ``ValueError`` when ``.index`` are different. + +Previous Behavior (``Series``): + +``Series`` compared values ignoring the ``.index`` as long as both had the same length: + +.. code-block:: ipython + + In [1]: s1 == s2 + Out[1]: + A False + B True + C False + dtype: bool + +**New behavior** (``Series``): + +.. code-block:: ipython + + In [2]: s1 == s2 + Out[2]: + ValueError: Can only compare identically-labeled Series objects + +.. note:: + + To achieve the same result as previous versions (compare values based on locations ignoring ``.index``), compare both ``.values``. + + .. ipython:: python + + s1.values == s2.values + + If you want to compare ``Series`` aligning its ``.index``, see flexible comparison methods section below: + + .. 
ipython:: python + + s1.eq(s2) + +Current Behavior (``DataFrame``, no change): + +.. code-block:: ipython + + In [3]: df1 == df2 + Out[3]: + ValueError: Can only compare identically-labeled DataFrame objects + +Logical operators +""""""""""""""""" + +Logical operators align both ``.index`` of left and right hand side. + +Previous behavior (``Series``), only left hand side ``index`` was kept: + +.. code-block:: ipython + + In [4]: s1 = pd.Series([True, False, True], index=list('ABC')) + In [5]: s2 = pd.Series([True, True, True], index=list('ABD')) + In [6]: s1 & s2 + Out[6]: + A True + B False + C False + dtype: bool + +**New behavior** (``Series``): + +.. ipython:: python + + s1 = pd.Series([True, False, True], index=list('ABC')) + s2 = pd.Series([True, True, True], index=list('ABD')) + s1 & s2 + +.. note:: + ``Series`` logical operators fill a ``NaN`` result with ``False``. + +.. note:: + To achieve the same result as previous versions (compare values based on only left hand side index), you can use ``reindex_like``: + + .. ipython:: python + + s1 & s2.reindex_like(s1) + +Current Behavior (``DataFrame``, no change): + +.. ipython:: python + + df1 = pd.DataFrame([True, False, True], index=list('ABC')) + df2 = pd.DataFrame([True, True, True], index=list('ABD')) + df1 & df2 + +Flexible comparison methods +""""""""""""""""""""""""""" + +``Series`` flexible comparison methods like ``eq``, ``ne``, ``le``, ``lt``, ``ge`` and ``gt`` now align both ``index``. Use these operators if you want to compare two ``Series`` +which has the different ``index``. + +.. ipython:: python + + s1 = pd.Series([1, 2, 3], index=['a', 'b', 'c']) + s2 = pd.Series([2, 2, 2], index=['b', 'c', 'd']) + s1.eq(s2) + s1.ge(s2) + +Previously, this worked the same as comparison operators (see above). + +.. _whatsnew_0190.api.promote: + +``Series`` type promotion on assignment +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +A ``Series`` will now correctly promote its dtype for assignment with incompat values to the current dtype (:issue:`13234`) + + +.. ipython:: python + + s = pd.Series() + +**Previous behavior**: + +.. code-block:: ipython + + In [2]: s["a"] = pd.Timestamp("2016-01-01") + + In [3]: s["b"] = 3.0 + TypeError: invalid type promotion + +**New behavior**: + +.. ipython:: python + + s["a"] = pd.Timestamp("2016-01-01") + s["b"] = 3.0 + s + s.dtype + +.. _whatsnew_0190.api.to_datetime_coerce: + +``.to_datetime()`` changes +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Previously if ``.to_datetime()`` encountered mixed integers/floats and strings, but no datetimes with ``errors='coerce'`` it would convert all to ``NaT``. +**Previous behavior**: +.. code-block:: ipython + In [2]: pd.to_datetime([1, 'foo'], errors='coerce') + Out[2]: DatetimeIndex(['NaT', 'NaT'], dtype='datetime64[ns]', freq=None) +This will now convert integers/floats with the default unit of ``ns``. + +.. ipython:: python + + pd.to_datetime([1, 'foo'], errors='coerce') + +Bug fixes related to ``.to_datetime()``: + +- Bug in ``pd.to_datetime()`` when passing integers or floats, and no ``unit`` and ``errors='coerce'`` (:issue:`13180`). +- Bug in ``pd.to_datetime()`` when passing invalid datatypes (e.g. 
bool); will now respect the ``errors`` keyword (:issue:`13176`) +- Bug in ``pd.to_datetime()`` which overflowed on ``int8``, and ``int16`` dtypes (:issue:`13451`) +- Bug in ``pd.to_datetime()`` raise ``AttributeError`` with ``NaN`` and the other string is not valid when ``errors='ignore'`` (:issue:`12424`) +- Bug in ``pd.to_datetime()`` did not cast floats correctly when ``unit`` was specified, resulting in truncated datetime (:issue:`13834`) + +.. _whatsnew_0190.api.merging: + +Merging changes +^^^^^^^^^^^^^^^ + +Merging will now preserve the dtype of the join keys (:issue:`8596`) + +.. ipython:: python + + df1 = pd.DataFrame({'key': [1], 'v1': [10]}) + df1 + df2 = pd.DataFrame({'key': [1, 2], 'v1': [20, 30]}) + df2 + +**Previous behavior**: + +.. code-block:: ipython + + In [5]: pd.merge(df1, df2, how='outer') + Out[5]: + key v1 + 0 1.0 10.0 + 1 1.0 20.0 + 2 2.0 30.0 + + In [6]: pd.merge(df1, df2, how='outer').dtypes + Out[6]: + key float64 + v1 float64 + dtype: object + +**New behavior**: + +We are able to preserve the join keys + +.. ipython:: python + + pd.merge(df1, df2, how='outer') + pd.merge(df1, df2, how='outer').dtypes + +Of course if you have missing values that are introduced, then the +resulting dtype will be upcast, which is unchanged from previous. + +.. ipython:: python + + pd.merge(df1, df2, how='outer', on='key') + pd.merge(df1, df2, how='outer', on='key').dtypes + +.. _whatsnew_0190.api.describe: + +``.describe()`` changes +^^^^^^^^^^^^^^^^^^^^^^^ + +Percentile identifiers in the index of a ``.describe()`` output will now be rounded to the least precision that keeps them distinct (:issue:`13104`) + +.. ipython:: python + + s = pd.Series([0, 1, 2, 3, 4]) + df = pd.DataFrame([0, 1, 2, 3, 4]) + +**Previous behavior**: + +The percentiles were rounded to at most one decimal place, which could raise ``ValueError`` for a data frame if the percentiles were duplicated. + +.. code-block:: ipython + + In [3]: s.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999, 0.9995, 0.9999]) + Out[3]: + count 5.000000 + mean 2.000000 + std 1.581139 + min 0.000000 + 0.0% 0.000400 + 0.1% 0.002000 + 0.1% 0.004000 + 50% 2.000000 + 99.9% 3.996000 + 100.0% 3.998000 + 100.0% 3.999600 + max 4.000000 + dtype: float64 + + In [4]: df.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999, 0.9995, 0.9999]) + Out[4]: + ... + ValueError: cannot reindex from a duplicate axis + +**New behavior**: + +.. ipython:: python + + s.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999, 0.9995, 0.9999]) + df.describe(percentiles=[0.0001, 0.0005, 0.001, 0.999, 0.9995, 0.9999]) + +Furthermore: + +- Passing duplicated ``percentiles`` will now raise a ``ValueError``. +- Bug in ``.describe()`` on a DataFrame with a mixed-dtype column index, which would previously raise a ``TypeError`` (:issue:`13288`) + +.. _whatsnew_0190.api.period: + +``Period`` changes +^^^^^^^^^^^^^^^^^^ + +``PeriodIndex`` now has ``period`` dtype +"""""""""""""""""""""""""""""""""""""""" + +``PeriodIndex`` now has its own ``period`` dtype. The ``period`` dtype is a +pandas extension dtype like ``category`` or the :ref:`timezone aware dtype ` (``datetime64[ns, tz]``). (:issue:`13941`). +As a consequence of this change, ``PeriodIndex`` no longer has an integer dtype: + +**Previous behavior**: + +.. 
code-block:: ipython + + In [1]: pi = pd.PeriodIndex(['2016-08-01'], freq='D') + + In [2]: pi + Out[2]: PeriodIndex(['2016-08-01'], dtype='int64', freq='D') + + In [3]: pd.api.types.is_integer_dtype(pi) + Out[3]: True + + In [4]: pi.dtype + Out[4]: dtype('int64') + +**New behavior**: + +.. ipython:: python + + pi = pd.PeriodIndex(['2016-08-01'], freq='D') + pi + pd.api.types.is_integer_dtype(pi) + pd.api.types.is_period_dtype(pi) + pi.dtype + type(pi.dtype) + +.. _whatsnew_0190.api.periodnat: + +``Period('NaT')`` now returns ``pd.NaT`` +"""""""""""""""""""""""""""""""""""""""" + +Previously, ``Period`` has its own ``Period('NaT')`` representation different from ``pd.NaT``. Now ``Period('NaT')`` has been changed to return ``pd.NaT``. (:issue:`12759`, :issue:`13582`) + +**Previous behavior**: + +.. code-block:: ipython + + In [5]: pd.Period('NaT', freq='D') + Out[5]: Period('NaT', 'D') + +**New behavior**: + +These result in ``pd.NaT`` without providing ``freq`` option. + +.. ipython:: python + + pd.Period('NaT') + pd.Period(None) + + +To be compatible with ``Period`` addition and subtraction, ``pd.NaT`` now supports addition and subtraction with ``int``. Previously it raised ``ValueError``. + +**Previous behavior**: + +.. code-block:: ipython + + In [5]: pd.NaT + 1 + ... + ValueError: Cannot add integral value to Timestamp without freq. + +**New behavior**: + +.. ipython:: python + + pd.NaT + 1 + pd.NaT - 1 + +``PeriodIndex.values`` now returns array of ``Period`` object +""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" + +``.values`` is changed to return an array of ``Period`` objects, rather than an array +of integers (:issue:`13988`). + +**Previous behavior**: + +.. code-block:: ipython + + In [6]: pi = pd.PeriodIndex(['2011-01', '2011-02'], freq='M') + In [7]: pi.values + array([492, 493]) + +**New behavior**: + +.. ipython:: python + + pi = pd.PeriodIndex(['2011-01', '2011-02'], freq='M') + pi.values + + +.. _whatsnew_0190.api.setops: + +Index ``+`` / ``-`` no longer used for set operations +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Addition and subtraction of the base Index type and of DatetimeIndex +(not the numeric index types) +previously performed set operations (set union and difference). This +behavior was already deprecated since 0.15.0 (in favor using the specific +``.union()`` and ``.difference()`` methods), and is now disabled. When +possible, ``+`` and ``-`` are now used for element-wise operations, for +example for concatenating strings or subtracting datetimes +(:issue:`8227`, :issue:`14127`). + +Previous behavior: + +.. code-block:: ipython + + In [1]: pd.Index(['a', 'b']) + pd.Index(['a', 'c']) + FutureWarning: using '+' to provide set union with Indexes is deprecated, use '|' or .union() + Out[1]: Index(['a', 'b', 'c'], dtype='object') + +**New behavior**: the same operation will now perform element-wise addition: + +.. ipython:: python + + pd.Index(['a', 'b']) + pd.Index(['a', 'c']) + +Note that numeric Index objects already performed element-wise operations. +For example, the behavior of adding two integer Indexes: + +.. ipython:: python + + pd.Index([1, 2, 3]) + pd.Index([2, 3, 4]) + +is unchanged. The base ``Index`` is now made consistent with this behavior. + +Further, because of this change, it is now possible to subtract two +DatetimeIndex objects resulting in a TimedeltaIndex: + +Previous behavior: + +.. 
code-block:: ipython + + In [1]: pd.DatetimeIndex(['2016-01-01', '2016-01-02']) - pd.DatetimeIndex(['2016-01-02', '2016-01-03']) + FutureWarning: using '-' to provide set differences with datetimelike Indexes is deprecated, use .difference() + Out[1]: DatetimeIndex(['2016-01-01'], dtype='datetime64[ns]', freq=None) + +**New behavior**: + +.. ipython:: python + + pd.DatetimeIndex(['2016-01-01', '2016-01-02']) - pd.DatetimeIndex(['2016-01-02', '2016-01-03']) + + +.. _whatsnew_0190.api.difference: + +``Index.difference`` and ``.symmetric_difference`` changes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``Index.difference`` and ``Index.symmetric_difference`` will now, more consistently, treat ``NaN`` values as any other values. (:issue:`13514`) + +.. ipython:: python + + idx1 = pd.Index([1, 2, 3, np.nan]) + idx2 = pd.Index([0, 1, np.nan]) + +**Previous behavior**: + +.. code-block:: ipython + + In [3]: idx1.difference(idx2) + Out[3]: Float64Index([nan, 2.0, 3.0], dtype='float64') + + In [4]: idx1.symmetric_difference(idx2) + Out[4]: Float64Index([0.0, nan, 2.0, 3.0], dtype='float64') + +**New behavior**: + +.. ipython:: python + + idx1.difference(idx2) + idx1.symmetric_difference(idx2) + +.. _whatsnew_0190.api.unique_index: + +``Index.unique`` consistently returns ``Index`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``Index.unique()`` now returns unique values as an +``Index`` of the appropriate ``dtype``. (:issue:`13395`). +Previously, most ``Index`` classes returned ``np.ndarray``, and ``DatetimeIndex``, +``TimedeltaIndex`` and ``PeriodIndex`` returned ``Index`` to keep metadata like timezone. + +**Previous behavior**: + +.. code-block:: ipython + + In [1]: pd.Index([1, 2, 3]).unique() + Out[1]: array([1, 2, 3]) + + In [2]: pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'], tz='Asia/Tokyo').unique() + Out[2]: + DatetimeIndex(['2011-01-01 00:00:00+09:00', '2011-01-02 00:00:00+09:00', + '2011-01-03 00:00:00+09:00'], + dtype='datetime64[ns, Asia/Tokyo]', freq=None) + +**New behavior**: + +.. ipython:: python + + pd.Index([1, 2, 3]).unique() + pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'], tz='Asia/Tokyo').unique() + +.. _whatsnew_0190.api.multiindex: + +``MultiIndex`` constructors, ``groupby`` and ``set_index`` preserve categorical dtypes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``MultiIndex.from_arrays`` and ``MultiIndex.from_product`` will now preserve categorical dtype +in ``MultiIndex`` levels. (:issue:`13743`, :issue:`13854`) + +.. ipython:: python + + cat = pd.Categorical(['a', 'b'], categories=list("bac")) + lvl1 = ['foo', 'bar'] + midx = pd.MultiIndex.from_arrays([cat, lvl1]) + midx + +**Previous behavior**: + +.. code-block:: ipython + + In [4]: midx.levels[0] + Out[4]: Index(['b', 'a', 'c'], dtype='object') + + In [5]: midx.get_level_values[0] + Out[5]: Index(['a', 'b'], dtype='object') + +**New behavior**: the single level is now a ``CategoricalIndex``: + +.. ipython:: python + + midx.levels[0] + midx.get_level_values(0) + +An analogous change has been made to ``MultiIndex.from_product``. +As a consequence, ``groupby`` and ``set_index`` also preserve categorical dtypes in indexes + +.. ipython:: python + + df = pd.DataFrame({'A': [0, 1], 'B': [10, 11], 'C': cat}) + df_grouped = df.groupby(by=['A', 'C']).first() + df_set_idx = df.set_index(['A', 'C']) + +**Previous behavior**: + +.. 
code-block:: ipython + + In [11]: df_grouped.index.levels[1] + Out[11]: Index(['b', 'a', 'c'], dtype='object', name='C') + In [12]: df_grouped.reset_index().dtypes + Out[12]: + A int64 + C object + B float64 + dtype: object + + In [13]: df_set_idx.index.levels[1] + Out[13]: Index(['b', 'a', 'c'], dtype='object', name='C') + In [14]: df_set_idx.reset_index().dtypes + Out[14]: + A int64 + C object + B int64 + dtype: object + +**New behavior**: + +.. ipython:: python + + df_grouped.index.levels[1] + df_grouped.reset_index().dtypes + + df_set_idx.index.levels[1] + df_set_idx.reset_index().dtypes + +.. _whatsnew_0190.api.autogenerated_chunksize_index: + +``read_csv`` will progressively enumerate chunks +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When :func:`read_csv` is called with ``chunksize=n`` and without specifying an index, +each chunk used to have an independently generated index from ``0`` to ``n-1``. +They are now given instead a progressive index, starting from ``0`` for the first chunk, +from ``n`` for the second, and so on, so that, when concatenated, they are identical to +the result of calling :func:`read_csv` without the ``chunksize=`` argument. +(:issue:`12185`) + +.. ipython :: python + + data = 'A,B\n0,1\n2,3\n4,5\n6,7' + +**Previous behavior**: + +.. code-block:: ipython + + In [2]: pd.concat(pd.read_csv(StringIO(data), chunksize=2)) + Out[2]: + A B + 0 0 1 + 1 2 3 + 0 4 5 + 1 6 7 + +**New behavior**: + +.. ipython :: python + + pd.concat(pd.read_csv(StringIO(data), chunksize=2)) + +.. _whatsnew_0190.sparse: + +Sparse Changes +^^^^^^^^^^^^^^ + +These changes allow pandas to handle sparse data with more dtypes, and for work to make a smoother experience with data handling. + +``int64`` and ``bool`` support enhancements +""""""""""""""""""""""""""""""""""""""""""" + +Sparse data structures now gained enhanced support of ``int64`` and ``bool`` ``dtype`` (:issue:`667`, :issue:`13849`). + +Previously, sparse data were ``float64`` dtype by default, even if all inputs were of ``int`` or ``bool`` dtype. You had to specify ``dtype`` explicitly to create sparse data with ``int64`` dtype. Also, ``fill_value`` had to be specified explicitly because the default was ``np.nan`` which doesn't appear in ``int64`` or ``bool`` data. + +.. code-block:: ipython + + In [1]: pd.SparseArray([1, 2, 0, 0]) + Out[1]: + [1.0, 2.0, 0.0, 0.0] + Fill: nan + IntIndex + Indices: array([0, 1, 2, 3], dtype=int32) + + # specifying int64 dtype, but all values are stored in sp_values because + # fill_value default is np.nan + In [2]: pd.SparseArray([1, 2, 0, 0], dtype=np.int64) + Out[2]: + [1, 2, 0, 0] + Fill: nan + IntIndex + Indices: array([0, 1, 2, 3], dtype=int32) + + In [3]: pd.SparseArray([1, 2, 0, 0], dtype=np.int64, fill_value=0) + Out[3]: + [1, 2, 0, 0] + Fill: 0 + IntIndex + Indices: array([0, 1], dtype=int32) + +As of v0.19.0, sparse data keeps the input dtype, and uses more appropriate ``fill_value`` defaults (``0`` for ``int64`` dtype, ``False`` for ``bool`` dtype). + +.. ipython:: python + + pd.SparseArray([1, 2, 0, 0], dtype=np.int64) + pd.SparseArray([True, False, False, False]) + +See the :ref:`docs ` for more details. + +Operators now preserve dtypes +""""""""""""""""""""""""""""" + +- Sparse data structure now can preserve ``dtype`` after arithmetic ops (:issue:`13848`) + + .. ipython:: python + + s = pd.SparseSeries([0, 2, 0, 1], fill_value=0, dtype=np.int64) + s.dtype + + s + 1 + +- Sparse data structure now support ``astype`` to convert internal ``dtype`` (:issue:`13900`) + + .. 
ipython:: python + + s = pd.SparseSeries([1., 0., 2., 0.], fill_value=0) + s + s.astype(np.int64) + + ``astype`` fails if data contains values which cannot be converted to specified ``dtype``. + Note that the limitation is applied to ``fill_value`` which default is ``np.nan``. + + .. code-block:: ipython + + In [7]: pd.SparseSeries([1., np.nan, 2., np.nan], fill_value=np.nan).astype(np.int64) + Out[7]: + ValueError: unable to coerce current fill_value nan to int64 dtype + +Other sparse fixes +"""""""""""""""""" + +- Subclassed ``SparseDataFrame`` and ``SparseSeries`` now preserve class types when slicing or transposing. (:issue:`13787`) +- ``SparseArray`` with ``bool`` dtype now supports logical (bool) operators (:issue:`14000`) +- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing may raise ``IndexError`` (:issue:`13144`) +- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing result may have normal ``Index`` (:issue:`13144`) +- Bug in ``SparseDataFrame`` in which ``axis=None`` did not default to ``axis=0`` (:issue:`13048`) +- Bug in ``SparseSeries`` and ``SparseDataFrame`` creation with ``object`` dtype may raise ``TypeError`` (:issue:`11633`) +- Bug in ``SparseDataFrame`` doesn't respect passed ``SparseArray`` or ``SparseSeries`` 's dtype and ``fill_value`` (:issue:`13866`) +- Bug in ``SparseArray`` and ``SparseSeries`` don't apply ufunc to ``fill_value`` (:issue:`13853`) +- Bug in ``SparseSeries.abs`` incorrectly keeps negative ``fill_value`` (:issue:`13853`) +- Bug in single row slicing on multi-type ``SparseDataFrame`` s, types were previously forced to float (:issue:`13917`) +- Bug in ``SparseSeries`` slicing changes integer dtype to float (:issue:`8292`) +- Bug in ``SparseDataFarme`` comparison ops may raise ``TypeError`` (:issue:`13001`) +- Bug in ``SparseDataFarme.isnull`` raises ``ValueError`` (:issue:`8276`) +- Bug in ``SparseSeries`` representation with ``bool`` dtype may raise ``IndexError`` (:issue:`13110`) +- Bug in ``SparseSeries`` and ``SparseDataFrame`` of ``bool`` or ``int64`` dtype may display its values like ``float64`` dtype (:issue:`13110`) +- Bug in sparse indexing using ``SparseArray`` with ``bool`` dtype may return incorrect result (:issue:`13985`) +- Bug in ``SparseArray`` created from ``SparseSeries`` may lose ``dtype`` (:issue:`13999`) +- Bug in ``SparseSeries`` comparison with dense returns normal ``Series`` rather than ``SparseSeries`` (:issue:`13999`) + + +.. _whatsnew_0190.indexer_dtype: + +Indexer dtype Changes +^^^^^^^^^^^^^^^^^^^^^ + +.. note:: + + This change only affects 64 bit python running on Windows, and only affects relatively advanced + indexing operations + +Methods such as ``Index.get_indexer`` that return an indexer array, coerce that array to a "platform int", so that it can be +directly used in 3rd party library operations like ``numpy.take``. Previously, a platform int was defined as ``np.int_`` +which corresponds to a C integer, but the correct type, and what is being used now, is ``np.intp``, which corresponds +to the C integer size that can hold a pointer. (:issue:`3033`, :issue:`13972`) + +These types are the same on many platform, but for 64 bit python on Windows, +``np.int_`` is 32 bits, and ``np.intp`` is 64 bits. Changing this behavior improves performance for many +operations on that platform. + +**Previous behavior**: + +.. code-block:: ipython + + In [1]: i = pd.Index(['a', 'b', 'c']) + + In [2]: i.get_indexer(['b', 'b', 'c']).dtype + Out[2]: dtype('int32') + +**New behavior**: + +.. 
code-block:: ipython + + In [1]: i = pd.Index(['a', 'b', 'c']) + + In [2]: i.get_indexer(['b', 'b', 'c']).dtype + Out[2]: dtype('int64') + + +.. _whatsnew_0190.api.other: Other API Changes ^^^^^^^^^^^^^^^^^ +- ``Timestamp.to_pydatetime`` will issue a ``UserWarning`` when ``warn=True``, and the instance has a non-zero number of nanoseconds, previously this would print a message to stdout. (:issue:`14101`) +- Non-convertible dates in an excel date column will be returned without conversion and the column will be ``object`` dtype, rather than raising an exception (:issue:`10001`) +- ``Series.unique()`` with datetime and timezone now returns return array of ``Timestamp`` with timezone (:issue:`13565`) +- ``pd.Timedelta(None)`` is now accepted and will return ``NaT``, mirroring ``pd.Timestamp`` (:issue:`13687`) +- ``Panel.to_sparse()`` will raise a ``NotImplementedError`` exception when called (:issue:`13778`) +- ``Index.reshape()`` will raise a ``NotImplementedError`` exception when called (:issue:`12882`) +- ``.filter()`` enforces mutual exclusion of the keyword arguments. (:issue:`12399`) +- ``eval``'s upcasting rules for ``float32`` types have been updated to be more consistent with NumPy's rules. New behavior will not upcast to ``float64`` if you multiply a pandas ``float32`` object by a scalar float64. (:issue:`12388`) +- An ``UnsupportedFunctionCall`` error is now raised if NumPy ufuncs like ``np.mean`` are called on groupby or resample objects (:issue:`12811`) +- ``__setitem__`` will no longer apply a callable rhs as a function instead of storing it. Call ``where`` directly to get the previous behavior. (:issue:`13299`) +- Calls to ``.sample()`` will respect the random seed set via ``numpy.random.seed(n)`` (:issue:`13161`) +- ``Styler.apply`` is now more strict about the outputs your function must return. For ``axis=0`` or ``axis=1``, the output shape must be identical. For ``axis=None``, the output must be a DataFrame with identical columns and index labels. (:issue:`13222`) +- ``Float64Index.astype(int)`` will now raise ``ValueError`` if ``Float64Index`` contains ``NaN`` values (:issue:`13149`) +- ``TimedeltaIndex.astype(int)`` and ``DatetimeIndex.astype(int)`` will now return ``Int64Index`` instead of ``np.array`` (:issue:`13209`) +- Passing ``Period`` with multiple frequencies to normal ``Index`` now returns ``Index`` with ``object`` dtype (:issue:`13664`) +- ``PeriodIndex.fillna`` with ``Period`` has different freq now coerces to ``object`` dtype (:issue:`13664`) +- Faceted boxplots from ``DataFrame.boxplot(by=col)`` now return a ``Series`` when ``return_type`` is not None. Previously these returned an ``OrderedDict``. Note that when ``return_type=None``, the default, these still return a 2-D NumPy array. (:issue:`12216`, :issue:`7096`) +- ``pd.read_hdf`` will now raise a ``ValueError`` instead of ``KeyError``, if a mode other than ``r``, ``r+`` and ``a`` is supplied. (:issue:`13623`) +- ``pd.read_csv()``, ``pd.read_table()``, and ``pd.read_hdf()`` raise the builtin ``FileNotFoundError`` exception for Python 3.x when called on a nonexistent file; this is back-ported as ``IOError`` in Python 2.x (:issue:`14086`) +- More informative exceptions are passed through the csv parser. The exception type would now be the original exception type instead of ``CParserError``. 
(:issue:`13652`) +- ``pd.read_csv()`` in the C engine will now issue a ``ParserWarning`` or raise a ``ValueError`` when ``sep`` encoded is more than one character long (:issue:`14065`) +- ``DataFrame.values`` will now return ``float64`` with a ``DataFrame`` of mixed ``int64`` and ``uint64`` dtypes, conforming to ``np.find_common_type`` (:issue:`10364`, :issue:`13917`) + .. _whatsnew_0190.deprecations: Deprecations -^^^^^^^^^^^^ - - - +~~~~~~~~~~~~ +- ``Categorical.reshape`` has been deprecated and will be removed in a subsequent release (:issue:`12882`) +- ``Series.reshape`` has been deprecated and will be removed in a subsequent release (:issue:`12882`) +- ``PeriodIndex.to_datetime`` has been deprecated in favor of ``PeriodIndex.to_timestamp`` (:issue:`8254`) +- ``Timestamp.to_datetime`` has been deprecated in favor of ``Timestamp.to_pydatetime`` (:issue:`8254`) +- ``pandas.core.datetools`` module has been deprecated and will be removed in a subsequent release (:issue:`14094`) +- ``Index.to_datetime`` and ``DatetimeIndex.to_datetime`` have been deprecated in favor of ``pd.to_datetime`` (:issue:`8254`) +- ``SparseList`` has been deprecated and will be removed in a future version (:issue:`13784`) +- ``DataFrame.to_html()`` and ``DataFrame.to_latex()`` have dropped the ``colSpace`` parameter in favor of ``col_space`` (:issue:`13857`) +- ``DataFrame.to_sql()`` has deprecated the ``flavor`` parameter, as it is superfluous when SQLAlchemy is not installed (:issue:`13611`) +- ``compact_ints`` and ``use_unsigned`` have been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13320`) +- ``buffer_lines`` has been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13360`) +- ``as_recarray`` has been deprecated in ``pd.read_csv()`` and will be removed in a future version (:issue:`13373`) +- ``skip_footer`` has been deprecated in ``pd.read_csv()`` in favor of ``skipfooter`` and will be removed in a future version (:issue:`13349`) +- top-level ``pd.ordered_merge()`` has been renamed to ``pd.merge_ordered()`` and the original name will be removed in a future version (:issue:`13358`) +- ``Timestamp.offset`` property (and named arg in the constructor), has been deprecated in favor of ``freq`` (:issue:`12160`) +- ``pd.tseries.util.pivot_annual`` is deprecated. Use ``pivot_table`` as alternative, an example is :ref:`here ` (:issue:`736`) +- ``pd.tseries.util.isleapyear`` has been deprecated and will be removed in a subsequent release. Datetime-likes now have a ``.is_leap_year`` property. (:issue:`13727`) +- ``Panel4D`` and ``PanelND`` constructors are deprecated and will be removed in a future version. The recommended way to represent these types of n-dimensional data are with the `xarray package `__. Pandas provides a :meth:`~Panel4D.to_xarray` method to automate this conversion. (:issue:`13564`) +- ``pandas.tseries.frequencies.get_standard_freq`` is deprecated. Use ``pandas.tseries.frequencies.to_offset(freq).rule_code`` instead. (:issue:`13874`) +- ``pandas.tseries.frequencies.to_offset``'s ``freqstr`` keyword is deprecated in favor of ``freq``. (:issue:`13874`) +- ``Categorical.from_array`` has been deprecated and will be removed in a future version (:issue:`13854`) .. 
_whatsnew_0190.prior_deprecations: Removal of prior version deprecations/changes -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - - +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +- The ``SparsePanel`` class has been removed (:issue:`13778`) +- The ``pd.sandbox`` module has been removed in favor of the external library ``pandas-qt`` (:issue:`13670`) +- The ``pandas.io.data`` and ``pandas.io.wb`` modules are removed in favor of + the `pandas-datareader package `__ (:issue:`13724`). +- The ``pandas.tools.rplot`` module has been removed in favor of + the `seaborn package `__ (:issue:`13855`) +- ``DataFrame.to_csv()`` has dropped the ``engine`` parameter, as was deprecated in 0.17.1 (:issue:`11274`, :issue:`13419`) +- ``DataFrame.to_dict()`` has dropped the ``outtype`` parameter in favor of ``orient`` (:issue:`13627`, :issue:`8486`) +- ``pd.Categorical`` has dropped setting of the ``ordered`` attribute directly in favor of the ``set_ordered`` method (:issue:`13671`) +- ``pd.Categorical`` has dropped the ``levels`` attribute in favor of ``categories`` (:issue:`8376`) +- ``DataFrame.to_sql()`` has dropped the ``mysql`` option for the ``flavor`` parameter (:issue:`13611`) +- ``Panel.shift()`` has dropped the ``lags`` parameter in favor of ``periods`` (:issue:`14041`) +- ``pd.Index`` has dropped the ``diff`` method in favor of ``difference`` (:issue:`13669`) +- ``pd.DataFrame`` has dropped the ``to_wide`` method in favor of ``to_panel`` (:issue:`14039`) +- ``Series.to_csv`` has dropped the ``nanRep`` parameter in favor of ``na_rep`` (:issue:`13804`) +- ``Series.xs``, ``DataFrame.xs``, ``Panel.xs``, ``Panel.major_xs``, and ``Panel.minor_xs`` have dropped the ``copy`` parameter (:issue:`13781`) +- ``str.split`` has dropped the ``return_type`` parameter in favor of ``expand`` (:issue:`13701`) +- Removal of the legacy time rules (offset aliases), deprecated since 0.17.0 (this has been alias since 0.8.0) (:issue:`13590`, :issue:`13868`). Now legacy time rules raises ``ValueError``. For the list of currently supported offsets, see :ref:`here `. +- The default value for the ``return_type`` parameter for ``DataFrame.plot.box`` and ``DataFrame.boxplot`` changed from ``None`` to ``"axes"``. These methods will now return a matplotlib axes by default instead of a dictionary of artists. See :ref:`here ` (:issue:`6581`). +- The ``tquery`` and ``uquery`` functions in the ``pandas.io.sql`` module are removed (:issue:`5950`). .. 
_whatsnew_0190.performance: @@ -73,11 +1387,184 @@ Removal of prior version deprecations/changes Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - - +- Improved performance of sparse ``IntIndex.intersect`` (:issue:`13082`) +- Improved performance of sparse arithmetic with ``BlockIndex`` when the number of blocks are large, though recommended to use ``IntIndex`` in such cases (:issue:`13082`) +- Improved performance of ``DataFrame.quantile()`` as it now operates per-block (:issue:`11623`) +- Improved performance of float64 hash table operations, fixing some very slow indexing and groupby operations in python 3 (:issue:`13166`, :issue:`13334`) +- Improved performance of ``DataFrameGroupBy.transform`` (:issue:`12737`) +- Improved performance of ``Index`` and ``Series`` ``.duplicated`` (:issue:`10235`) +- Improved performance of ``Index.difference`` (:issue:`12044`) +- Improved performance of ``RangeIndex.is_monotonic_increasing`` and ``is_monotonic_decreasing`` (:issue:`13749`) +- Improved performance of datetime string parsing in ``DatetimeIndex`` (:issue:`13692`) +- Improved performance of hashing ``Period`` (:issue:`12817`) +- Improved performance of ``factorize`` of datetime with timezone (:issue:`13750`) .. _whatsnew_0190.bug_fixes: Bug Fixes ~~~~~~~~~ + +- Bug in ``groupby().shift()``, which could cause a segfault or corruption in rare circumstances when grouping by columns with missing values (:issue:`13813`) +- Bug in ``groupby().cumsum()`` calculating ``cumprod`` when ``axis=1``. (:issue:`13994`) +- Bug in ``pd.to_timedelta()`` in which the ``errors`` parameter was not being respected (:issue:`13613`) +- Bug in ``io.json.json_normalize()``, where non-ascii keys raised an exception (:issue:`13213`) +- Bug when passing a not-default-indexed ``Series`` as ``xerr`` or ``yerr`` in ``.plot()`` (:issue:`11858`) +- Bug in area plot draws legend incorrectly if subplot is enabled or legend is moved after plot (matplotlib 1.5.0 is required to draw area plot legend properly) (:issue:`9161`, :issue:`13544`) +- Bug in ``DataFrame`` assignment with an object-dtyped ``Index`` where the resultant column is mutable to the original object. 
(:issue:`13522`) +- Bug in matplotlib ``AutoDataFormatter``; this restores the second scaled formatting and re-adds micro-second scaled formatting (:issue:`13131`) +- Bug in selection from a ``HDFStore`` with a fixed format and ``start`` and/or ``stop`` specified will now return the selected range (:issue:`8287`) +- Bug in ``Categorical.from_codes()`` where an unhelpful error was raised when an invalid ``ordered`` parameter was passed in (:issue:`14058`) +- Bug in ``Series`` construction from a tuple of integers on windows not returning default dtype (int64) (:issue:`13646`) + +- Bug in ``.groupby(..).resample(..)`` when the same object is called multiple times (:issue:`13174`) +- Bug in ``.to_records()`` when index name is a unicode string (:issue:`13172`) + +- Bug in calling ``.memory_usage()`` on object which doesn't implement (:issue:`12924`) + +- Regression in ``Series.quantile`` with nans (also shows up in ``.median()`` and ``.describe()`` ); furthermore now names the ``Series`` with the quantile (:issue:`13098`, :issue:`13146`) + +- Bug in ``SeriesGroupBy.transform`` with datetime values and missing groups (:issue:`13191`) +- Bug where empty ``Series`` were incorrectly coerced in datetime-like numeric operations (:issue:`13844`) +- Bug in ``Categorical`` constructor when passed a ``Categorical`` containing datetimes with timezones (:issue:`14190`) +- Bug in ``Series.str.extractall()`` with ``str`` index raises ``ValueError`` (:issue:`13156`) +- Bug in ``Series.str.extractall()`` with single group and quantifier (:issue:`13382`) +- Bug in ``DatetimeIndex`` and ``Period`` subtraction raises ``ValueError`` or ``AttributeError`` rather than ``TypeError`` (:issue:`13078`) +- Bug in ``Index`` and ``Series`` created with ``NaN`` and ``NaT`` mixed data may not have ``datetime64`` dtype (:issue:`13324`) +- Bug in ``Index`` and ``Series`` may ignore ``np.datetime64('nat')`` and ``np.timdelta64('nat')`` to infer dtype (:issue:`13324`) +- Bug in ``PeriodIndex`` and ``Period`` subtraction raises ``AttributeError`` (:issue:`13071`) +- Bug in ``PeriodIndex`` construction returning a ``float64`` index in some circumstances (:issue:`13067`) +- Bug in ``.resample(..)`` with a ``PeriodIndex`` not changing its ``freq`` appropriately when empty (:issue:`13067`) +- Bug in ``.resample(..)`` with a ``PeriodIndex`` not retaining its type or name with an empty ``DataFrame`` appropriately when empty (:issue:`13212`) +- Bug in ``groupby(..).apply(..)`` when the passed function returns scalar values per group (:issue:`13468`). +- Bug in ``groupby(..).resample(..)`` where passing some keywords would raise an exception (:issue:`13235`) +- Bug in ``.tz_convert`` on a tz-aware ``DateTimeIndex`` that relied on index being sorted for correct results (:issue:`13306`) +- Bug in ``.tz_localize`` with ``dateutil.tz.tzlocal`` may return incorrect result (:issue:`13583`) +- Bug in ``DatetimeTZDtype`` dtype with ``dateutil.tz.tzlocal`` cannot be regarded as valid dtype (:issue:`13583`) +- Bug in ``pd.read_hdf()`` where attempting to load an HDF file with a single dataset, that had one or more categorical columns, failed unless the key argument was set to the name of the dataset. 
(:issue:`13231`) +- Bug in ``.rolling()`` that allowed a negative integer window in contruction of the ``Rolling()`` object, but would later fail on aggregation (:issue:`13383`) +- Bug in ``Series`` indexing with tuple-valued data and a numeric index (:issue:`13509`) + +- Bug in printing ``pd.DataFrame`` where unusual elements with the ``object`` dtype were causing segfaults (:issue:`13717`) +- Bug in ranking ``Series`` which could result in segfaults (:issue:`13445`) +- Bug in various index types, which did not propagate the name of passed index (:issue:`12309`) +- Bug in ``DatetimeIndex``, which did not honour the ``copy=True`` (:issue:`13205`) +- Bug in ``DatetimeIndex.is_normalized`` returns incorrectly for normalized date_range in case of local timezones (:issue:`13459`) + +- Bug in ``pd.concat`` and ``.append`` may coerces ``datetime64`` and ``timedelta`` to ``object`` dtype containing python built-in ``datetime`` or ``timedelta`` rather than ``Timestamp`` or ``Timedelta`` (:issue:`13626`) +- Bug in ``PeriodIndex.append`` may raises ``AttributeError`` when the result is ``object`` dtype (:issue:`13221`) +- Bug in ``CategoricalIndex.append`` may accept normal ``list`` (:issue:`13626`) +- Bug in ``pd.concat`` and ``.append`` with the same timezone get reset to UTC (:issue:`7795`) +- Bug in ``Series`` and ``DataFrame`` ``.append`` raises ``AmbiguousTimeError`` if data contains datetime near DST boundary (:issue:`13626`) + + +- Bug in ``DataFrame.to_csv()`` in which float values were being quoted even though quotations were specified for non-numeric values only (:issue:`12922`, :issue:`13259`) +- Bug in ``DataFrame.describe()`` raising ``ValueError`` with only boolean columns (:issue:`13898`) +- Bug in ``MultiIndex`` slicing where extra elements were returned when level is non-unique (:issue:`12896`) +- Bug in ``.str.replace`` does not raise ``TypeError`` for invalid replacement (:issue:`13438`) +- Bug in ``MultiIndex.from_arrays`` which didn't check for input array lengths matching (:issue:`13599`) + + +- Bug in ``pd.read_csv()`` which may cause a segfault or corruption when iterating in large chunks over a stream/file under rare circumstances (:issue:`13703`) +- Bug in ``pd.read_csv()`` which caused errors to be raised when a dictionary containing scalars is passed in for ``na_values`` (:issue:`12224`) +- Bug in ``pd.read_csv()`` which caused BOM files to be incorrectly parsed by not ignoring the BOM (:issue:`4793`) +- Bug in ``pd.read_csv()`` with ``engine='python'`` which raised errors when a numpy array was passed in for ``usecols`` (:issue:`12546`) +- Bug in ``pd.read_csv()`` where the index columns were being incorrectly parsed when parsed as dates with a ``thousands`` parameter (:issue:`14066`) +- Bug in ``pd.read_csv()`` with ``engine='python'`` in which ``NaN`` values weren't being detected after data was converted to numeric values (:issue:`13314`) +- Bug in ``pd.read_csv()`` in which the ``nrows`` argument was not properly validated for both engines (:issue:`10476`) +- Bug in ``pd.read_csv()`` with ``engine='python'`` in which infinities of mixed-case forms were not being interpreted properly (:issue:`13274`) +- Bug in ``pd.read_csv()`` with ``engine='python'`` in which trailing ``NaN`` values were not being parsed (:issue:`13320`) +- Bug in ``pd.read_csv()`` with ``engine='python'`` when reading from a ``tempfile.TemporaryFile`` on Windows with Python 3 (:issue:`13398`) +- Bug in ``pd.read_csv()`` that prevents ``usecols`` kwarg from accepting single-byte unicode strings 
(:issue:`13219`) +- Bug in ``pd.read_csv()`` that prevents ``usecols`` from being an empty set (:issue:`13402`) +- Bug in ``pd.read_csv()`` in the C engine where the NULL character was not being parsed as NULL (:issue:`14012`) +- Bug in ``pd.read_csv()`` with ``engine='c'`` in which NULL ``quotechar`` was not accepted even though ``quoting`` was specified as ``None`` (:issue:`13411`) +- Bug in ``pd.read_csv()`` with ``engine='c'`` in which fields were not properly cast to float when quoting was specified as non-numeric (:issue:`13411`) +- Bug in ``pd.read_csv()`` in Python 2.x with non-UTF8 encoded, multi-character separated data (:issue:`3404`) +- Bug in ``pd.read_csv()``, where aliases for utf-xx (e.g. UTF-xx, UTF_xx, utf_xx) raised UnicodeDecodeError (:issue:`13549`) +- Bug in ``pd.read_csv``, ``pd.read_table``, ``pd.read_fwf``, ``pd.read_stata`` and ``pd.read_sas`` where files were opened by parsers but not closed if both ``chunksize`` and ``iterator`` were ``None``. (:issue:`13940`) +- Bug in ``StataReader``, ``StataWriter``, ``XportReader`` and ``SAS7BDATReader`` where a file was not properly closed when an error was raised. (:issue:`13940`) + + +- Bug in ``pd.pivot_table()`` where ``margins_name`` is ignored when ``aggfunc`` is a list (:issue:`13354`) +- Bug in ``pd.Series.str.zfill``, ``center``, ``ljust``, ``rjust``, and ``pad`` when passing non-integers, did not raise ``TypeError`` (:issue:`13598`) +- Bug in checking for any null objects in a ``TimedeltaIndex``, which always returned ``True`` (:issue:`13603`) + + + +- Bug in ``Series`` arithmetic raises ``TypeError`` if it contains datetime-like as ``object`` dtype (:issue:`13043`) + +- Bug ``Series.isnull()`` and ``Series.notnull()`` ignore ``Period('NaT')`` (:issue:`13737`) +- Bug ``Series.fillna()`` and ``Series.dropna()`` don't affect to ``Period('NaT')`` (:issue:`13737` +- Bug in ``.fillna(value=np.nan)`` incorrectly raises ``KeyError`` on a ``category`` dtyped ``Series`` (:issue:`14021`) + +- Bug in extension dtype creation where the created types were not is/identical (:issue:`13285`) +- Bug in ``.resample(..)`` where incorrect warnings were triggered by IPython introspection (:issue:`13618`) +- Bug in ``NaT`` - ``Period`` raises ``AttributeError`` (:issue:`13071`) +- Bug in ``Series`` comparison may output incorrect result if rhs contains ``NaT`` (:issue:`9005`) +- Bug in ``Series`` and ``Index`` comparison may output incorrect result if it contains ``NaT`` with ``object`` dtype (:issue:`13592`) +- Bug in ``Period`` addition raises ``TypeError`` if ``Period`` is on right hand side (:issue:`13069`) +- Bug in ``Peirod`` and ``Series`` or ``Index`` comparison raises ``TypeError`` (:issue:`13200`) +- Bug in ``pd.set_eng_float_format()`` that would prevent NaN's from formatting (:issue:`11981`) +- Bug in ``.unstack`` with ``Categorical`` dtype resets ``.ordered`` to ``True`` (:issue:`13249`) +- Clean some compile time warnings in datetime parsing (:issue:`13607`) +- Bug in ``factorize`` raises ``AmbiguousTimeError`` if data contains datetime near DST boundary (:issue:`13750`) +- Bug in ``.set_index`` raises ``AmbiguousTimeError`` if new index contains DST boundary and multi levels (:issue:`12920`) +- Bug in ``.shift`` raises ``AmbiguousTimeError`` if data contains datetime near DST boundary (:issue:`13926`) +- Bug in ``pd.read_hdf()`` returns incorrect result when a ``DataFrame`` with a ``categorical`` column and a query which doesn't match any values (:issue:`13792`) + + +- Bug in ``Series`` comparison operators when dealing 
with zero dim NumPy arrays (:issue:`13006`) +- Bug in ``.combine_first`` may return incorrect ``dtype`` (:issue:`7630`, :issue:`10567`) +- Bug in ``groupby`` where ``apply`` returns different result depending on whether first result is ``None`` or not (:issue:`12824`) +- Bug in ``groupby(..).nth()`` where the group key is included inconsistently if called after ``.head()/.tail()`` (:issue:`12839`) +- Bug in ``.to_html``, ``.to_latex`` and ``.to_string`` silently ignore custom datetime formatter passed through the ``formatters`` key word (:issue:`10690`) +- Bug in ``DataFrame.iterrows()``, not yielding a ``Series`` subclasse if defined (:issue:`13977`) + +- Bug in ``pd.to_numeric`` when ``errors='coerce'`` and input contains non-hashable objects (:issue:`13324`) +- Bug in invalid ``Timedelta`` arithmetic and comparison may raise ``ValueError`` rather than ``TypeError`` (:issue:`13624`) +- Bug in invalid datetime parsing in ``to_datetime`` and ``DatetimeIndex`` may raise ``TypeError`` rather than ``ValueError`` (:issue:`11169`, :issue:`11287`) +- Bug in ``Index`` created with tz-aware ``Timestamp`` and mismatched ``tz`` option incorrectly coerces timezone (:issue:`13692`) +- Bug in ``DatetimeIndex`` with nanosecond frequency does not include timestamp specified with ``end`` (:issue:`13672`) +- Bug in ```Series``` when setting a slice with a ```np.timedelta64``` (:issue:`14155`) + +- Bug in ``Index`` raises ``OutOfBoundsDatetime`` if ``datetime`` exceeds ``datetime64[ns]`` bounds, rather than coercing to ``object`` dtype (:issue:`13663`) +- Bug in ``Index`` may ignore specified ``datetime64`` or ``timedelta64`` passed as ``dtype`` (:issue:`13981`) +- Bug in ``RangeIndex`` can be created without no arguments rather than raises ``TypeError`` (:issue:`13793`) +- Bug in ``.value_counts()`` raises ``OutOfBoundsDatetime`` if data exceeds ``datetime64[ns]`` bounds (:issue:`13663`) +- Bug in ``DatetimeIndex`` may raise ``OutOfBoundsDatetime`` if input ``np.datetime64`` has other unit than ``ns`` (:issue:`9114`) +- Bug in ``Series`` creation with ``np.datetime64`` which has other unit than ``ns`` as ``object`` dtype results in incorrect values (:issue:`13876`) +- Bug in ``resample`` with timedelta data where data was casted to float (:issue:`13119`). +- Bug in ``pd.isnull()`` ``pd.notnull()`` raise ``TypeError`` if input datetime-like has other unit than ``ns`` (:issue:`13389`) +- Bug in ``pd.merge()`` may raise ``TypeError`` if input datetime-like has other unit than ``ns`` (:issue:`13389`) + +- Bug in ``HDFStore``/``read_hdf()`` discarded ``DatetimeIndex.name`` if ``tz`` was set (:issue:`13884`) + +- Bug in ``Categorical.remove_unused_categories()`` changes ``.codes`` dtype to platform int (:issue:`13261`) +- Bug in ``groupby`` with ``as_index=False`` returns all NaN's when grouping on multiple columns including a categorical one (:issue:`13204`) +- Bug in ``df.groupby(...)[...]`` where getitem with ``Int64Index`` raised an error (:issue:`13731`) + +- Bug in the CSS classes assigned to ``DataFrame.style`` for index names. Previously they were assigned ``"col_heading level col"`` where ``n`` was the number of levels + 1. Now they are assigned ``"index_name level"``, where ``n`` is the correct level for that MultiIndex. 
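As a minimal sketch (not taken from this patch) of the non-nanosecond ``datetime64`` fixes noted above: inputs stored with a coarser unit such as days should now be coerced to ``datetime64[ns]`` rather than raising or producing incorrect values. The expectations in the comments are illustrative, not output captured from this build:

.. code-block:: python

    import numpy as np
    import pandas as pd

    # values stored with day ('D') resolution instead of nanoseconds ('ns')
    day_values = np.array(['2016-01-01', '2016-01-02'], dtype='datetime64[D]')

    # pd.isnull()/pd.notnull() previously could raise TypeError for non-'ns'
    # input; with the fixes above they should return element-wise booleans
    pd.isnull(day_values)

    # Series construction should coerce to datetime64[ns] instead of
    # yielding incorrect values
    pd.Series(day_values).dtype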
+- Bug where ``pd.read_gbq()`` could throw ``ImportError: No module named discovery`` as a result of a naming conflict with another python package called apiclient (:issue:`13454`) +- Bug in ``Index.union`` returns an incorrect result with a named empty index (:issue:`13432`) +- Bugs in ``Index.difference`` and ``DataFrame.join`` raise in Python3 when using mixed-integer indexes (:issue:`13432`, :issue:`12814`) +- Bug in subtract tz-aware ``datetime.datetime`` from tz-aware ``datetime64`` series (:issue:`14088`) +- Bug in ``.to_excel()`` when DataFrame contains a MultiIndex which contains a label with a NaN value (:issue:`13511`) +- Bug in invalid frequency offset string like "D1", "-2-3H" may not raise ``ValueError (:issue:`13930`) +- Bug in ``concat`` and ``groupby`` for hierarchical frames with ``RangeIndex`` levels (:issue:`13542`). +- Bug in ``Series.str.contains()`` for Series containing only ``NaN`` values of ``object`` dtype (:issue:`14171`) +- Bug in ``agg()`` function on groupby dataframe changes dtype of ``datetime64[ns]`` column to ``float64`` (:issue:`12821`) +- Bug in using NumPy ufunc with ``PeriodIndex`` to add or subtract integer raise ``IncompatibleFrequency``. Note that using standard operator like ``+`` or ``-`` is recommended, because standard operators use more efficient path (:issue:`13980`) +- Bug in operations on ``NaT`` returning ``float`` instead of ``datetime64[ns]`` (:issue:`12941`) +- Bug in ``Series`` flexible arithmetic methods (like ``.add()``) raises ``ValueError`` when ``axis=None`` (:issue:`13894`) +- Bug in ``DataFrame.to_csv()`` with ``MultiIndex`` columns in which a stray empty line was added (:issue:`6618`) +- Bug in ``DatetimeIndex``, ``TimedeltaIndex`` and ``PeriodIndex.equals()`` may return ``True`` when input isn't ``Index`` but contains the same values (:issue:`13107`) + + +- Bug in ``Index`` raises ``KeyError`` displaying incorrect column when column is not in the df and columns contains duplicate values (:issue:`13822`) +- Bug in ``Period`` and ``PeriodIndex`` creating wrong dates when frequency has combined offset aliases (:issue:`13874`) +- Bug in ``.to_string()`` when called with an integer ``line_width`` and ``index=False`` raises an UnboundLocalError exception because ``idx`` referenced before assignment. + +- Bug in ``eval()`` where the ``resolvers`` argument would not accept a list (:issue:`14095`) +- Bugs in ``stack``, ``get_dummies``, ``make_axis_dummies`` which don't preserve categorical dtypes in (multi)indexes (:issue:`13854`) +- ``PeridIndex`` can now accept ``list`` and ``array`` which contains ``pd.NaT`` (:issue:`13430`) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt new file mode 100644 index 0000000000000..4aee6f72b1d53 --- /dev/null +++ b/doc/source/whatsnew/v0.20.0.txt @@ -0,0 +1,84 @@ +.. _whatsnew_0200: + +v0.20.0 (????, 2016) +-------------------- + +This is a major release from 0.19 and includes a small number of API changes, several new features, +enhancements, and performance improvements along with a large number of bug fixes. We recommend that all +users upgrade to this version. + +Highlights include: + + +Check the :ref:`API Changes ` and :ref:`deprecations ` before updating. + +.. contents:: What's new in v0.19.0 + :local: + :backlinks: none + +.. _whatsnew_0200.enhancements: + +New features +~~~~~~~~~~~~ + + + + + +.. _whatsnew_0200.enhancements.other: + +Other enhancements +^^^^^^^^^^^^^^^^^^ + + + + + + +.. 
_whatsnew_0200.api_breaking: + +Backwards incompatible API changes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. _whatsnew_0200.api: + + + + + + +Other API Changes +^^^^^^^^^^^^^^^^^ + +.. _whatsnew_0200.deprecations: + +Deprecations +^^^^^^^^^^^^ + + + + + +.. _whatsnew_0200.prior_deprecations: + +Removal of prior version deprecations/changes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + + + + +.. _whatsnew_0200.performance: + +Performance Improvements +~~~~~~~~~~~~~~~~~~~~~~~~ + + + + + +.. _whatsnew_0200.bug_fixes: + +Bug Fixes +~~~~~~~~~ + diff --git a/doc/source/whatsnew/v0.8.0.txt b/doc/source/whatsnew/v0.8.0.txt index a76c4e487d5d8..4136c108fba57 100644 --- a/doc/source/whatsnew/v0.8.0.txt +++ b/doc/source/whatsnew/v0.8.0.txt @@ -59,7 +59,7 @@ Time series changes and improvements aggregation functions, and control over how the intervals and result labeling are defined. A suite of high performance Cython/C-based resampling functions (including Open-High-Low-Close) have also been implemented. -- Revamp of :ref:`frequency aliases ` and support for +- Revamp of :ref:`frequency aliases ` and support for **frequency shortcuts** like '15min', or '1h30min' - New :ref:`DatetimeIndex class ` supports both fixed frequency and irregular time @@ -134,7 +134,7 @@ Other new features - Move to klib-based hash tables for indexing; better performance and less memory usage than Python's dict - Add first, last, min, max, and prod optimized GroupBy functions -- New :ref:`ordered_merge ` function +- New :ref:`ordered_merge ` function - Add flexible :ref:`comparison ` instance methods eq, ne, lt, gt, etc. to DataFrame, Series - Improve :ref:`scatter_matrix ` plotting @@ -241,7 +241,7 @@ matplotlib knows how to handle ``datetime.datetime`` but not Timestamp objects. While I recommend that you plot time series using ``TimeSeries.plot``, you can either use ``to_pydatetime`` or register a converter for the Timestamp type. See `matplotlib documentation -`__ for more on this. +`__ for more on this. .. warning:: @@ -271,4 +271,3 @@ unique. In many cases it will no longer fail (some method like ``append`` still check for uniqueness unless disabled). However, all is not lost: you can inspect ``index.is_unique`` and raise an exception explicitly if it is ``False`` or go to a different code branch. - diff --git a/doc/sphinxext/ipython_sphinxext/ipython_directive.py b/doc/sphinxext/ipython_sphinxext/ipython_directive.py index ad7ada8e4eea3..b8b0935cd5b96 100644 --- a/doc/sphinxext/ipython_sphinxext/ipython_directive.py +++ b/doc/sphinxext/ipython_sphinxext/ipython_directive.py @@ -802,10 +802,14 @@ def setup(self): # reset the execution count if we haven't processed this doc #NOTE: this may be borked if there are multiple seen_doc tmp files #check time stamp? 
- if not self.state.document.current_source in self.seen_docs: + if self.state.document.current_source not in self.seen_docs: self.shell.IP.history_manager.reset() self.shell.IP.execution_count = 1 - self.shell.IP.prompt_manager.width = 0 + try: + self.shell.IP.prompt_manager.width = 0 + except AttributeError: + # GH14003: class promptManager has removed after IPython 5.x + pass self.seen_docs.add(self.state.document.current_source) # and attach to shell so we don't have to pass them around diff --git a/doc/sphinxext/numpydoc/phantom_import.py b/doc/sphinxext/numpydoc/phantom_import.py index 9a60b4a35b18f..4b4fec863a0e3 100755 --- a/doc/sphinxext/numpydoc/phantom_import.py +++ b/doc/sphinxext/numpydoc/phantom_import.py @@ -11,7 +11,7 @@ can be used to get the current docstrings from a Pydocweb instance without needing to rebuild the documented module. -.. [1] http://code.google.com/p/pydocweb +.. [1] https://github.com/pv/pydocweb """ from __future__ import division, absolute_import, print_function diff --git a/pandas/__init__.py b/pandas/__init__.py index 53642fdcfeb31..2d91c97144e3c 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -16,7 +16,7 @@ if missing_dependencies: raise ImportError("Missing required dependencies {0}".format(missing_dependencies)) - +del hard_dependencies, dependency, missing_dependencies # numpy compat from pandas.compat.numpy import * @@ -43,7 +43,8 @@ from pandas.io.api import * from pandas.computation.api import * -from pandas.tools.merge import merge, concat, ordered_merge +from pandas.tools.merge import (merge, concat, ordered_merge, + merge_ordered, merge_asof) from pandas.tools.pivot import pivot_table, crosstab from pandas.tools.plotting import scatter_matrix, plot_params from pandas.tools.tile import cut, qcut diff --git a/pandas/algos.pyx b/pandas/algos.pyx index a31b35ba4afc6..de5c5fc661d4d 100644 --- a/pandas/algos.pyx +++ b/pandas/algos.pyx @@ -1,3 +1,5 @@ +# cython: profile=False + from numpy cimport * cimport numpy as np import numpy as np @@ -11,6 +13,7 @@ cdef float64_t FP_ERR = 1e-13 cimport util from libc.stdlib cimport malloc, free +from libc.string cimport memmove from numpy cimport NPY_INT8 as NPY_int8 from numpy cimport NPY_INT16 as NPY_int16 @@ -31,38 +34,21 @@ float16 = np.dtype(np.float16) float32 = np.dtype(np.float32) float64 = np.dtype(np.float64) -cdef np.int8_t MINint8 = np.iinfo(np.int8).min -cdef np.int16_t MINint16 = np.iinfo(np.int16).min -cdef np.int32_t MINint32 = np.iinfo(np.int32).min -cdef np.int64_t MINint64 = np.iinfo(np.int64).min -cdef np.float16_t MINfloat16 = np.NINF -cdef np.float32_t MINfloat32 = np.NINF -cdef np.float64_t MINfloat64 = np.NINF - -cdef np.int8_t MAXint8 = np.iinfo(np.int8).max -cdef np.int16_t MAXint16 = np.iinfo(np.int16).max -cdef np.int32_t MAXint32 = np.iinfo(np.int32).max -cdef np.int64_t MAXint64 = np.iinfo(np.int64).max -cdef np.float16_t MAXfloat16 = np.inf -cdef np.float32_t MAXfloat32 = np.inf -cdef np.float64_t MAXfloat64 = np.inf - cdef double NaN = np.NaN cdef double nan = NaN -cdef inline int int_max(int a, int b): return a if a >= b else b -cdef inline int int_min(int a, int b): return a if a <= b else b - - cdef extern from "src/headers/math.h": double sqrt(double x) nogil double fabs(double) nogil - int signbit(double) nogil -from pandas import lib +# this is our util.pxd +from util cimport numeric, get_nat -include "skiplist.pyx" +cimport lib +from lib cimport is_null_datetimelike +from pandas import lib +cdef int64_t iNaT = get_nat() cdef: int TIEBREAK_AVERAGE = 
0 @@ -73,11 +59,11 @@ cdef: int TIEBREAK_DENSE = 5 tiebreakers = { - 'average' : TIEBREAK_AVERAGE, - 'min' : TIEBREAK_MIN, - 'max' : TIEBREAK_MAX, - 'first' : TIEBREAK_FIRST, - 'dense' : TIEBREAK_DENSE, + 'average': TIEBREAK_AVERAGE, + 'min': TIEBREAK_MIN, + 'max': TIEBREAK_MAX, + 'first': TIEBREAK_FIRST, + 'dense': TIEBREAK_DENSE, } @@ -503,7 +489,6 @@ def rank_1d_generic(object in_arr, bint retry=1, ties_method='average', bint keep_na = 0 float count = 0.0 - tiebreak = tiebreakers[ties_method] keep_na = na_option == 'keep' @@ -581,28 +566,28 @@ cdef inline are_diff(object left, object right): except TypeError: return left != right -_return_false = lambda self, other: False -_return_true = lambda self, other: True class Infinity(object): + """ provide a positive Infinity comparision method for ranking """ + + __lt__ = lambda self, other: False + __le__ = lambda self, other: self is other + __eq__ = lambda self, other: self is other + __ne__ = lambda self, other: self is not other + __gt__ = lambda self, other: self is not other + __ge__ = lambda self, other: True - __lt__ = _return_false - __le__ = _return_false - __eq__ = _return_false - __ne__ = _return_true - __gt__ = _return_true - __ge__ = _return_true - __cmp__ = _return_false class NegInfinity(object): + """ provide a negative Infinity comparision method for ranking """ + + __lt__ = lambda self, other: self is not other + __le__ = lambda self, other: True + __eq__ = lambda self, other: self is other + __ne__ = lambda self, other: self is not other + __gt__ = lambda self, other: False + __ge__ = lambda self, other: self is other - __lt__ = _return_true - __le__ = _return_true - __eq__ = _return_false - __ne__ = _return_true - __gt__ = _return_false - __ge__ = _return_false - __cmp__ = _return_true def rank_2d_generic(object in_arr, axis=0, ties_method='average', ascending=True, na_option='keep', pct=False): @@ -720,58 +705,6 @@ def rank_2d_generic(object in_arr, axis=0, ties_method='average', # return result -# Cython implementations of rolling sum, mean, variance, skewness, -# other statistical moment functions -# -# Misc implementation notes -# ------------------------- -# -# - In Cython x * x is faster than x ** 2 for C types, this should be -# periodically revisited to see if it's still true. -# -# - - -def _check_minp(win, minp, N, floor=1): - if minp > win: - raise ValueError('min_periods (%d) must be <= window (%d)' - % (minp, win)) - elif minp > N: - minp = N + 1 - elif minp < 0: - raise ValueError('min_periods must be >= 0') - return max(minp, floor) - -# original C implementation by N. Devillard. -# This code in public domain. -# Function : kth_smallest() -# In : array of elements, # of elements in the array, rank k -# Out : one element -# Job : find the kth smallest element in the array - -# Reference: - -# Author: Wirth, Niklaus -# Title: Algorithms + data structures = programs -# Publisher: Englewood Cliffs: Prentice-Hall, 1976 -# Physical description: 366 p. 
-# Series: Prentice-Hall Series in Automatic Computation - - -ctypedef fused numeric: - int8_t - int16_t - int32_t - int64_t - - uint8_t - uint16_t - uint32_t - uint64_t - - float32_t - float64_t - - cdef inline Py_ssize_t swap(numeric *a, numeric *b) nogil except -1: cdef numeric t @@ -813,11 +746,11 @@ cpdef numeric kth_smallest(numeric[:] a, Py_ssize_t k): cdef inline kth_smallest_c(float64_t* a, Py_ssize_t k, Py_ssize_t n): cdef: - Py_ssize_t i,j,l,m + Py_ssize_t i, j, l, m double_t x, t l = 0 - m = n-1 + m = n -1 while (l win - 1: - prev = input[i - win] - if prev == prev: - sum_x -= prev - nobs -= 1 - - if nobs >= minp: - output[i] = sum_x - else: - output[i] = NaN - - return output - -#------------------------------------------------------------------------------- -# Rolling mean -@cython.boundscheck(False) -@cython.wraparound(False) -def roll_mean(ndarray[double_t] input, - int win, int minp): - cdef: - double val, prev, result, sum_x = 0 - Py_ssize_t nobs = 0, i, neg_ct = 0 - Py_ssize_t N = len(input) - - cdef ndarray[double_t] output = np.empty(N, dtype=float) - minp = _check_minp(win, minp, N) - with nogil: - for i from 0 <= i < minp - 1: - val = input[i] - - # Not NaN - if val == val: - nobs += 1 - sum_x += val - if signbit(val): - neg_ct += 1 - - output[i] = NaN - - for i from minp - 1 <= i < N: - val = input[i] - - if val == val: - nobs += 1 - sum_x += val - if signbit(val): - neg_ct += 1 - - if i > win - 1: - prev = input[i - win] - if prev == prev: - sum_x -= prev - nobs -= 1 - if signbit(prev): - neg_ct -= 1 - - if nobs >= minp: - result = sum_x / nobs - if neg_ct == 0 and result < 0: - # all positive - output[i] = 0 - elif neg_ct == nobs and result > 0: - # all negative - output[i] = 0 - else: - output[i] = result - else: - output[i] = NaN - - return output - -#------------------------------------------------------------------------------- -# Exponentially weighted moving average - -def ewma(ndarray[double_t] input, double_t com, int adjust, int ignore_na, int minp): - """ - Compute exponentially-weighted moving average using center-of-mass. - - Parameters - ---------- - input : ndarray (float64 type) - com : float64 - adjust: int - ignore_na: int - minp: int - - Returns - ------- - y : ndarray - """ - - cdef Py_ssize_t N = len(input) - cdef ndarray[double_t] output = np.empty(N, dtype=float) - if N == 0: - return output - - minp = max(minp, 1) - - cdef double alpha, old_wt_factor, new_wt, weighted_avg, old_wt, cur - cdef Py_ssize_t i, nobs - - alpha = 1. / (1. + com) - old_wt_factor = 1. - alpha - new_wt = 1. if adjust else alpha - - weighted_avg = input[0] - is_observation = (weighted_avg == weighted_avg) - nobs = int(is_observation) - output[0] = weighted_avg if (nobs >= minp) else NaN - old_wt = 1. - - for i from 1 <= i < N: - cur = input[i] - is_observation = (cur == cur) - nobs += int(is_observation) - if weighted_avg == weighted_avg: - if is_observation or (not ignore_na): - old_wt *= old_wt_factor - if is_observation: - if weighted_avg != cur: # avoid numerical errors on constant series - weighted_avg = ((old_wt * weighted_avg) + (new_wt * cur)) / (old_wt + new_wt) - if adjust: - old_wt += new_wt - else: - old_wt = 1. 
- elif is_observation: - weighted_avg = cur - - output[i] = weighted_avg if (nobs >= minp) else NaN - - return output - -#------------------------------------------------------------------------------- -# Exponentially weighted moving covariance - -def ewmcov(ndarray[double_t] input_x, ndarray[double_t] input_y, - double_t com, int adjust, int ignore_na, int minp, int bias): - """ - Compute exponentially-weighted moving variance using center-of-mass. - - Parameters - ---------- - input_x : ndarray (float64 type) - input_y : ndarray (float64 type) - com : float64 - adjust: int - ignore_na: int - minp: int - bias: int - - Returns - ------- - y : ndarray - """ - - cdef Py_ssize_t N = len(input_x) - if len(input_y) != N: - raise ValueError('arrays are of different lengths (%d and %d)' % (N, len(input_y))) - cdef ndarray[double_t] output = np.empty(N, dtype=float) - if N == 0: - return output - - minp = max(minp, 1) - - cdef double alpha, old_wt_factor, new_wt, mean_x, mean_y, cov - cdef double sum_wt, sum_wt2, old_wt, cur_x, cur_y, old_mean_x, old_mean_y - cdef Py_ssize_t i, nobs - - alpha = 1. / (1. + com) - old_wt_factor = 1. - alpha - new_wt = 1. if adjust else alpha - - mean_x = input_x[0] - mean_y = input_y[0] - is_observation = ((mean_x == mean_x) and (mean_y == mean_y)) - nobs = int(is_observation) - if not is_observation: - mean_x = NaN - mean_y = NaN - output[0] = (0. if bias else NaN) if (nobs >= minp) else NaN - cov = 0. - sum_wt = 1. - sum_wt2 = 1. - old_wt = 1. - - for i from 1 <= i < N: - cur_x = input_x[i] - cur_y = input_y[i] - is_observation = ((cur_x == cur_x) and (cur_y == cur_y)) - nobs += int(is_observation) - if mean_x == mean_x: - if is_observation or (not ignore_na): - sum_wt *= old_wt_factor - sum_wt2 *= (old_wt_factor * old_wt_factor) - old_wt *= old_wt_factor - if is_observation: - old_mean_x = mean_x - old_mean_y = mean_y - if mean_x != cur_x: # avoid numerical errors on constant series - mean_x = ((old_wt * old_mean_x) + (new_wt * cur_x)) / (old_wt + new_wt) - if mean_y != cur_y: # avoid numerical errors on constant series - mean_y = ((old_wt * old_mean_y) + (new_wt * cur_y)) / (old_wt + new_wt) - cov = ((old_wt * (cov + ((old_mean_x - mean_x) * (old_mean_y - mean_y)))) + - (new_wt * ((cur_x - mean_x) * (cur_y - mean_y)))) / (old_wt + new_wt) - sum_wt += new_wt - sum_wt2 += (new_wt * new_wt) - old_wt += new_wt - if not adjust: - sum_wt /= old_wt - sum_wt2 /= (old_wt * old_wt) - old_wt = 1. - elif is_observation: - mean_x = cur_x - mean_y = cur_y - - if nobs >= minp: - if not bias: - numerator = sum_wt * sum_wt - denominator = numerator - sum_wt2 - output[i] = ((numerator / denominator) * cov) if (denominator > 0.) 
else NaN - else: - output[i] = cov - else: - output[i] = NaN - - return output - #---------------------------------------------------------------------- # Pairwise correlation/covariance + @cython.boundscheck(False) @cython.wraparound(False) def nancorr(ndarray[float64_t, ndim=2] mat, cov=False, minp=None): @@ -1213,6 +891,7 @@ def nancorr(ndarray[float64_t, ndim=2] mat, cov=False, minp=None): #---------------------------------------------------------------------- # Pairwise Spearman correlation + @cython.boundscheck(False) @cython.wraparound(False) def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1): @@ -1273,653 +952,6 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1): return result -#---------------------------------------------------------------------- -# Rolling variance - -@cython.boundscheck(False) -@cython.wraparound(False) -def roll_var(ndarray[double_t] input, int win, int minp, int ddof=1): - """ - Numerically stable implementation using Welford's method. - """ - cdef double val, prev, mean_x = 0, ssqdm_x = 0, nobs = 0, delta - cdef Py_ssize_t i - cdef Py_ssize_t N = len(input) - - cdef ndarray[double_t] output = np.empty(N, dtype=float) - - minp = _check_minp(win, minp, N) - - # Check for windows larger than array, addresses #7297 - win = min(win, N) - - with nogil: - # Over the first window, observations can only be added, never removed - for i from 0 <= i < win: - val = input[i] - - # Not NaN - if val == val: - nobs += 1 - delta = (val - mean_x) - mean_x += delta / nobs - ssqdm_x += delta * (val - mean_x) - - if (nobs >= minp) and (nobs > ddof): - #pathological case - if nobs == 1: - val = 0 - else: - val = ssqdm_x / (nobs - ddof) - if val < 0: - val = 0 - else: - val = NaN - - output[i] = val - - # After the first window, observations can both be added and removed - for i from win <= i < N: - val = input[i] - prev = input[i - win] - - if val == val: - if prev == prev: - # Adding one observation and removing another one - delta = val - prev - prev -= mean_x - mean_x += delta / nobs - val -= mean_x - ssqdm_x += (val + prev) * delta - else: - # Adding one observation and not removing any - nobs += 1 - delta = (val - mean_x) - mean_x += delta / nobs - ssqdm_x += delta * (val - mean_x) - elif prev == prev: - # Adding no new observation, but removing one - nobs -= 1 - if nobs: - delta = (prev - mean_x) - mean_x -= delta / nobs - ssqdm_x -= delta * (prev - mean_x) - else: - mean_x = 0 - ssqdm_x = 0 - # Variance is unchanged if no observation is added or removed - - if (nobs >= minp) and (nobs > ddof): - #pathological case - if nobs == 1: - val = 0 - else: - val = ssqdm_x / (nobs - ddof) - if val < 0: - val = 0 - else: - val = NaN - - output[i] = val - - return output - - -#------------------------------------------------------------------------------- -# Rolling skewness -@cython.boundscheck(False) -@cython.wraparound(False) -def roll_skew(ndarray[double_t] input, int win, int minp): - cdef double val, prev - cdef double x = 0, xx = 0, xxx = 0 - cdef Py_ssize_t nobs = 0, i - cdef Py_ssize_t N = len(input) - - cdef ndarray[double_t] output = np.empty(N, dtype=float) - - # 3 components of the skewness equation - cdef double A, B, C, R - - minp = _check_minp(win, minp, N) - with nogil: - for i from 0 <= i < minp - 1: - val = input[i] - - # Not NaN - if val == val: - nobs += 1 - x += val - xx += val * val - xxx += val * val * val - - output[i] = NaN - - for i from minp - 1 <= i < N: - val = input[i] - - if val == val: - nobs += 1 - x += val 
- xx += val * val - xxx += val * val * val - - if i > win - 1: - prev = input[i - win] - if prev == prev: - x -= prev - xx -= prev * prev - xxx -= prev * prev * prev - - nobs -= 1 - if nobs >= minp: - A = x / nobs - B = xx / nobs - A * A - C = xxx / nobs - A * A * A - 3 * A * B - if B <= 0 or nobs < 3: - output[i] = NaN - else: - R = sqrt(B) - output[i] = ((sqrt(nobs * (nobs - 1.)) * C) / - ((nobs-2) * R * R * R)) - else: - output[i] = NaN - - return output - -#------------------------------------------------------------------------------- -# Rolling kurtosis -@cython.boundscheck(False) -@cython.wraparound(False) -def roll_kurt(ndarray[double_t] input, - int win, int minp): - cdef double val, prev - cdef double x = 0, xx = 0, xxx = 0, xxxx = 0 - cdef Py_ssize_t nobs = 0, i - cdef Py_ssize_t N = len(input) - - cdef ndarray[double_t] output = np.empty(N, dtype=float) - - # 5 components of the kurtosis equation - cdef double A, B, C, D, R, K - - minp = _check_minp(win, minp, N) - with nogil: - for i from 0 <= i < minp - 1: - val = input[i] - - # Not NaN - if val == val: - nobs += 1 - - # seriously don't ask me why this is faster - x += val - xx += val * val - xxx += val * val * val - xxxx += val * val * val * val - - output[i] = NaN - - for i from minp - 1 <= i < N: - val = input[i] - - if val == val: - nobs += 1 - x += val - xx += val * val - xxx += val * val * val - xxxx += val * val * val * val - - if i > win - 1: - prev = input[i - win] - if prev == prev: - x -= prev - xx -= prev * prev - xxx -= prev * prev * prev - xxxx -= prev * prev * prev * prev - - nobs -= 1 - - if nobs >= minp: - A = x / nobs - R = A * A - B = xx / nobs - R - R = R * A - C = xxx / nobs - R - 3 * A * B - R = R * A - D = xxxx / nobs - R - 6*B*A*A - 4*C*A - - if B == 0 or nobs < 4: - output[i] = NaN - - else: - K = (nobs * nobs - 1.)*D/(B*B) - 3*((nobs-1.)**2) - K = K / ((nobs - 2.)*(nobs-3.)) - - output[i] = K - - else: - output[i] = NaN - - return output - -#------------------------------------------------------------------------------- -# Rolling median, min, max - -ctypedef double_t (* skiplist_f)(object sl, int n, int p) - -cdef _roll_skiplist_op(ndarray arg, int win, int minp, skiplist_f op): - cdef ndarray[double_t] input = arg - cdef double val, prev, midpoint - cdef IndexableSkiplist skiplist - cdef Py_ssize_t nobs = 0, i - - cdef Py_ssize_t N = len(input) - cdef ndarray[double_t] output = np.empty(N, dtype=float) - - skiplist = IndexableSkiplist(win) - - minp = _check_minp(win, minp, N) - - for i from 0 <= i < minp - 1: - val = input[i] - - # Not NaN - if val == val: - nobs += 1 - skiplist.insert(val) - - output[i] = NaN - - for i from minp - 1 <= i < N: - val = input[i] - - if i > win - 1: - prev = input[i - win] - - if prev == prev: - skiplist.remove(prev) - nobs -= 1 - - if val == val: - nobs += 1 - skiplist.insert(val) - - output[i] = op(skiplist, nobs, minp) - - return output - -from skiplist cimport * - - -@cython.boundscheck(False) -@cython.wraparound(False) -def roll_median_c(ndarray[float64_t] arg, int win, int minp): - cdef: - double val, res, prev - bint err=0 - int ret=0 - skiplist_t *sl - Py_ssize_t midpoint, nobs = 0, i - - - cdef Py_ssize_t N = len(arg) - cdef ndarray[double_t] output = np.empty(N, dtype=float) - - sl = skiplist_init(win) - if sl == NULL: - raise MemoryError("skiplist_init failed") - - minp = _check_minp(win, minp, N) - - with nogil: - for i from 0 <= i < minp - 1: - val = arg[i] - - # Not NaN - if val == val: - nobs += 1 - err = skiplist_insert(sl, val) != 1 - if err: - break 
- output[i] = NaN - - with nogil: - if not err: - for i from minp - 1 <= i < N: - - val = arg[i] - - if i > win - 1: - prev = arg[i - win] - - if prev == prev: - skiplist_remove(sl, prev) - nobs -= 1 - - if val == val: - nobs += 1 - err = skiplist_insert(sl, val) != 1 - if err: - break - - if nobs >= minp: - midpoint = nobs / 2 - if nobs % 2: - res = skiplist_get(sl, midpoint, &ret) - else: - res = (skiplist_get(sl, midpoint, &ret) + - skiplist_get(sl, (midpoint - 1), &ret)) / 2 - else: - res = NaN - - output[i] = res - - skiplist_destroy(sl) - if err: - raise MemoryError("skiplist_insert failed") - return output - -#---------------------------------------------------------------------- - -# Moving maximum / minimum code taken from Bottleneck under the terms -# of its Simplified BSD license -# https://github.com/kwgoodman/bottleneck - -from libc cimport stdlib - -@cython.boundscheck(False) -@cython.wraparound(False) -def roll_max(ndarray[numeric] a, int window, int minp): - """ - Moving max of 1d array of any numeric type along axis=0 ignoring NaNs. - - Parameters - ---------- - a: numpy array - window: int, size of rolling window - minp: if number of observations in window - is below this, output a NaN - """ - return _roll_min_max(a, window, minp, 1) - -@cython.boundscheck(False) -@cython.wraparound(False) -def roll_min(ndarray[numeric] a, int window, int minp): - """ - Moving max of 1d array of any numeric type along axis=0 ignoring NaNs. - - Parameters - ---------- - a: numpy array - window: int, size of rolling window - minp: if number of observations in window - is below this, output a NaN - """ - return _roll_min_max(a, window, minp, 0) - -@cython.boundscheck(False) -@cython.wraparound(False) -cdef _roll_min_max(ndarray[numeric] a, int window, int minp, bint is_max): - "Moving min/max of 1d array of any numeric type along axis=0 ignoring NaNs." 
- cdef numeric ai, aold - cdef Py_ssize_t count - cdef Py_ssize_t* death - cdef numeric* ring - cdef numeric* minvalue - cdef numeric* end - cdef numeric* last - cdef Py_ssize_t i0 - cdef np.npy_intp *dim - dim = PyArray_DIMS(a) - cdef Py_ssize_t n0 = dim[0] - cdef np.npy_intp *dims = [n0] - cdef bint should_replace - cdef np.ndarray[numeric, ndim=1] y = PyArray_EMPTY(1, dims, PyArray_TYPE(a), 0) - - if window < 1: - raise ValueError('Invalid window size %d' - % (window)) - - if minp > window: - raise ValueError('Invalid min_periods size %d greater than window %d' - % (minp, window)) - - minp = _check_minp(window, minp, n0) - with nogil: - ring = stdlib.malloc(window * sizeof(numeric)) - death = stdlib.malloc(window * sizeof(Py_ssize_t)) - end = ring + window - last = ring - - minvalue = ring - ai = a[0] - if numeric in cython.floating: - if ai == ai: - minvalue[0] = ai - elif is_max: - minvalue[0] = MINfloat64 - else: - minvalue[0] = MAXfloat64 - else: - minvalue[0] = ai - death[0] = window - - count = 0 - for i0 in range(n0): - ai = a[i0] - if numeric in cython.floating: - if ai == ai: - count += 1 - elif is_max: - ai = MINfloat64 - else: - ai = MAXfloat64 - else: - count += 1 - if i0 >= window: - aold = a[i0 - window] - if aold == aold: - count -= 1 - if death[minvalue-ring] == i0: - minvalue += 1 - if minvalue >= end: - minvalue = ring - should_replace = ai >= minvalue[0] if is_max else ai <= minvalue[0] - if should_replace: - minvalue[0] = ai - death[minvalue-ring] = i0 + window - last = minvalue - else: - should_replace = last[0] <= ai if is_max else last[0] >= ai - while should_replace: - if last == ring: - last = end - last -= 1 - should_replace = last[0] <= ai if is_max else last[0] >= ai - last += 1 - if last == end: - last = ring - last[0] = ai - death[last - ring] = i0 + window - if numeric in cython.floating: - if count >= minp: - y[i0] = minvalue[0] - else: - y[i0] = NaN - else: - y[i0] = minvalue[0] - - for i0 in range(minp - 1): - if numeric in cython.floating: - y[i0] = NaN - else: - y[i0] = 0 - - stdlib.free(ring) - stdlib.free(death) - return y - -def roll_quantile(ndarray[float64_t, cast=True] input, int win, - int minp, double quantile): - """ - O(N log(window)) implementation using skip list - """ - cdef double val, prev, midpoint - cdef IndexableSkiplist skiplist - cdef Py_ssize_t nobs = 0, i - cdef Py_ssize_t N = len(input) - cdef ndarray[double_t] output = np.empty(N, dtype=float) - - skiplist = IndexableSkiplist(win) - - minp = _check_minp(win, minp, N) - - for i from 0 <= i < minp - 1: - val = input[i] - - # Not NaN - if val == val: - nobs += 1 - skiplist.insert(val) - - output[i] = NaN - - for i from minp - 1 <= i < N: - val = input[i] - - if i > win - 1: - prev = input[i - win] - - if prev == prev: - skiplist.remove(prev) - nobs -= 1 - - if val == val: - nobs += 1 - skiplist.insert(val) - - if nobs >= minp: - idx = int((quantile / 1.) * (nobs - 1)) - output[i] = skiplist.get(idx) - else: - output[i] = NaN - - return output - -def roll_generic(ndarray[float64_t, cast=True] input, - int win, int minp, int offset, - object func, object args, object kwargs): - cdef ndarray[double_t] output, counts, bufarr - cdef Py_ssize_t i, n - cdef float64_t *buf - cdef float64_t *oldbuf - - if not input.flags.c_contiguous: - input = input.copy('C') - - n = len(input) - if n == 0: - return input - - minp = _check_minp(win, minp, n, floor=0) - output = np.empty(n, dtype=float) - counts = roll_sum(np.concatenate((np.isfinite(input).astype(float), np.array([0.] 
* offset))), win, minp)[offset:] - - # truncated windows at the beginning, through first full-length window - for i from 0 <= i < (int_min(win, n) - offset): - if counts[i] >= minp: - output[i] = func(input[0 : (i + offset + 1)], *args, **kwargs) - else: - output[i] = NaN - - # remaining full-length windows - buf = input.data - bufarr = np.empty(win, dtype=float) - oldbuf = bufarr.data - for i from (win - offset) <= i < (n - offset): - buf = buf + 1 - bufarr.data = buf - if counts[i] >= minp: - output[i] = func(bufarr, *args, **kwargs) - else: - output[i] = NaN - bufarr.data = oldbuf - - # truncated windows at the end - for i from int_max(n - offset, 0) <= i < n: - if counts[i] >= minp: - output[i] = func(input[int_max(i + offset - win + 1, 0) : n], *args, **kwargs) - else: - output[i] = NaN - - return output - - -def roll_window(ndarray[float64_t, ndim=1, cast=True] input, - ndarray[float64_t, ndim=1, cast=True] weights, - int minp, bint avg=True): - """ - Assume len(weights) << len(input) - """ - cdef: - ndarray[double_t] output, tot_wgt, counts - Py_ssize_t in_i, win_i, win_n, win_k, in_n, in_k - float64_t val_in, val_win, c, w - - in_n = len(input) - win_n = len(weights) - output = np.zeros(in_n, dtype=float) - counts = np.zeros(in_n, dtype=float) - if avg: - tot_wgt = np.zeros(in_n, dtype=float) - - minp = _check_minp(len(weights), minp, in_n) - - if avg: - for win_i from 0 <= win_i < win_n: - val_win = weights[win_i] - if val_win != val_win: - continue - - for in_i from 0 <= in_i < in_n - (win_n - win_i) + 1: - val_in = input[in_i] - if val_in == val_in: - output[in_i + (win_n - win_i) - 1] += val_in * val_win - counts[in_i + (win_n - win_i) - 1] += 1 - tot_wgt[in_i + (win_n - win_i) - 1] += val_win - - for in_i from 0 <= in_i < in_n: - c = counts[in_i] - if c < minp: - output[in_i] = NaN - else: - w = tot_wgt[in_i] - if w == 0: - output[in_i] = NaN - else: - output[in_i] /= tot_wgt[in_i] - - else: - for win_i from 0 <= win_i < win_n: - val_win = weights[win_i] - if val_win != val_win: - continue - - for in_i from 0 <= in_i < in_n - (win_n - win_i) + 1: - val_in = input[in_i] - - if val_in == val_in: - output[in_i + (win_n - win_i) - 1] += val_in * val_win - counts[in_i + (win_n - win_i) - 1] += 1 - - for in_i from 0 <= in_i < in_n: - c = counts[in_i] - if c < minp: - output[in_i] = NaN - - return output - - #---------------------------------------------------------------------- # group operations @@ -1938,16 +970,14 @@ def is_lexsorted(list list_of_arrays): cdef int64_t **vecs = malloc(nlevels * sizeof(int64_t*)) for i from 0 <= i < nlevels: - # vecs[i] = ( list_of_arrays[i]).data - arr = list_of_arrays[i] - vecs[i] = arr.data - # assume uniqueness?? + vecs[i] = arr.data + # Assume uniqueness?? 
for i from 1 <= i < n: for k from 0 <= k < nlevels: cur = vecs[k][i] - pre = vecs[k][i-1] + pre = vecs[k][i - 1] if cur == pre: continue elif cur > pre: @@ -1959,25 +989,40 @@ def is_lexsorted(list list_of_arrays): @cython.boundscheck(False) -def groupby_indices(ndarray values): +def groupby_indices(dict ids, ndarray[int64_t] labels, + ndarray[int64_t] counts): + """ + turn group_labels output into a combined indexer mapping the labels to + indexers + + Parameters + ---------- + ids: dict + mapping of label -> group indexer + labels: ndarray + labels for positions + counts: ndarray + group counts + + Returns + ------- + list of ndarrays of indices + + """ cdef: - Py_ssize_t i, n = len(values) - ndarray[int64_t] labels, counts, arr, seen + Py_ssize_t i, n = len(labels) + ndarray[int64_t] arr, seen int64_t loc - dict ids = {} - object val int64_t k + dict result = {} - ids, labels, counts = group_labels(values) seen = np.zeros_like(counts) - # try not to get in trouble here... cdef int64_t **vecs = malloc(len(ids) * sizeof(int64_t*)) - result = {} for i from 0 <= i < len(counts): arr = np.empty(counts[i], dtype=np.int64) result[ids[i]] = arr - vecs[i] = arr.data + vecs[i] = arr.data for i from 0 <= i < n: k = labels[i] @@ -1991,17 +1036,24 @@ def groupby_indices(ndarray values): seen[k] = loc + 1 free(vecs) - return result + @cython.wraparound(False) @cython.boundscheck(False) def group_labels(ndarray[object] values): """ Compute label vector from input values and associated useful data + Parameters + ---------- + values: object ndarray + Returns ------- + tuple of (reverse mappings of label -> group indexer, + factorized labels ndarray, + group counts ndarray) """ cdef: Py_ssize_t i, n = len(values) @@ -2067,6 +1119,7 @@ def groupsort_indexer(ndarray[int64_t] index, Py_ssize_t ngroups): #---------------------------------------------------------------------- # first, nth, last + @cython.boundscheck(False) @cython.wraparound(False) def group_nth_object(ndarray[object, ndim=2] out, @@ -2111,6 +1164,7 @@ def group_nth_object(ndarray[object, ndim=2] out, else: out[i, j] = resx[i, j] + @cython.boundscheck(False) @cython.wraparound(False) def group_nth_bin_object(ndarray[object, ndim=2] out, @@ -2161,6 +1215,7 @@ def group_nth_bin_object(ndarray[object, ndim=2] out, else: out[i, j] = resx[i, j] + @cython.boundscheck(False) @cython.wraparound(False) def group_last_object(ndarray[object, ndim=2] out, @@ -2203,6 +1258,7 @@ def group_last_object(ndarray[object, ndim=2] out, else: out[i, j] = resx[i, j] + @cython.boundscheck(False) @cython.wraparound(False) def group_last_bin_object(ndarray[object, ndim=2] out, @@ -2277,7 +1333,6 @@ cdef inline float64_t _median_linear(float64_t* a, int n): a = tmp n -= na_count - if n % 2: result = kth_smallest_c( a, n / 2, n) else: @@ -2289,5 +1344,8 @@ cdef inline float64_t _median_linear(float64_t* a, int n): return result -include "join.pyx" -include "generated.pyx" + +# generated from template +include "algos_common_helper.pxi" +include "algos_groupby_helper.pxi" +include "algos_take_helper.pxi" diff --git a/pandas/api/__init__.py b/pandas/api/__init__.py new file mode 100644 index 0000000000000..fcbf42f6dabc4 --- /dev/null +++ b/pandas/api/__init__.py @@ -0,0 +1 @@ +""" public toolkit API """ diff --git a/pandas/sandbox/__init__.py b/pandas/api/tests/__init__.py similarity index 100% rename from pandas/sandbox/__init__.py rename to pandas/api/tests/__init__.py diff --git a/pandas/api/tests/test_api.py b/pandas/api/tests/test_api.py new file mode 100644 index
0000000000000..d4d8b7e4e9747 --- /dev/null +++ b/pandas/api/tests/test_api.py @@ -0,0 +1,227 @@ +# -*- coding: utf-8 -*- + +import pandas as pd +from pandas.core import common as com +from pandas import api +from pandas.api import types +from pandas.util import testing as tm + +_multiprocess_can_split_ = True + + +class Base(object): + + def check(self, namespace, expected, ignored=None): + # see which names are in the namespace, minus optional + # ignored ones + # compare vs the expected + + result = sorted([f for f in dir(namespace) if not f.startswith('_')]) + if ignored is not None: + result = sorted(list(set(result) - set(ignored))) + + expected = sorted(expected) + tm.assert_almost_equal(result, expected) + + +class TestPDApi(Base, tm.TestCase): + + # these are optionally imported based on testing + # & need to be ignored + ignored = ['tests', 'rpy', 'locale'] + + # top-level sub-packages + lib = ['api', 'compat', 'computation', 'core', + 'indexes', 'formats', 'pandas', + 'test', 'tools', 'tseries', + 'types', 'util', 'options', 'io'] + + # top-level packages that are c-imports, should rename to _* + # to avoid naming conflicts + lib_to_rename = ['algos', 'hashtable', 'tslib', 'msgpack', 'sparse', + 'json', 'lib', 'index', 'parser'] + + # these are already deprecated; awaiting removal + deprecated_modules = ['ols', 'stats', 'datetools'] + + # misc + misc = ['IndexSlice', 'NaT'] + + # top-level classes + classes = ['Categorical', 'CategoricalIndex', 'DataFrame', 'DateOffset', + 'DatetimeIndex', 'ExcelFile', 'ExcelWriter', 'Float64Index', + 'Grouper', 'HDFStore', 'Index', 'Int64Index', 'MultiIndex', + 'Period', 'PeriodIndex', 'RangeIndex', + 'Series', 'SparseArray', 'SparseDataFrame', + 'SparseSeries', 'TimeGrouper', 'Timedelta', + 'TimedeltaIndex', 'Timestamp'] + + # these are already deprecated; awaiting removal + deprecated_classes = ['TimeSeries', 'WidePanel', + 'SparseTimeSeries', 'Panel4D', + 'SparseList'] + + # these should be deprecated in the future + deprecated_classes_in_future = ['Term', 'Panel'] + + # these should be removed from top-level namespace + remove_classes_from_top_level_namespace = ['Expr'] + + # external modules exposed in pandas namespace + modules = ['np', 'datetime'] + + # top-level functions + funcs = ['bdate_range', 'concat', 'crosstab', 'cut', + 'date_range', 'eval', + 'factorize', 'get_dummies', 'get_store', + 'infer_freq', 'isnull', 'lreshape', + 'match', 'melt', 'notnull', 'offsets', + 'merge', 'merge_ordered', 'merge_asof', + 'period_range', + 'pivot', 'pivot_table', 'plot_params', 'qcut', + 'scatter_matrix', + 'show_versions', 'timedelta_range', 'unique', + 'value_counts', 'wide_to_long'] + + # top-level option funcs + funcs_option = ['reset_option', 'describe_option', 'get_option', + 'option_context', 'set_option', + 'set_eng_float_format'] + + # top-level read_* funcs + funcs_read = ['read_clipboard', 'read_csv', 'read_excel', 'read_fwf', + 'read_gbq', 'read_hdf', 'read_html', 'read_json', + 'read_msgpack', 'read_pickle', 'read_sas', 'read_sql', + 'read_sql_query', 'read_sql_table', 'read_stata', + 'read_table'] + + # top-level to_* funcs + funcs_to = ['to_datetime', 'to_msgpack', + 'to_numeric', 'to_pickle', 'to_timedelta'] + + # these should be deprecated in the future + deprecated_funcs_in_future = ['pnow', 'groupby', 'info'] + + # these are already deprecated; awaiting removal + deprecated_funcs = ['ewma', 'ewmcorr', 'ewmcov', 'ewmstd', 'ewmvar', + 'ewmvol', 'expanding_apply', 'expanding_corr', + 'expanding_count', 'expanding_cov', 
'expanding_kurt', + 'expanding_max', 'expanding_mean', 'expanding_median', + 'expanding_min', 'expanding_quantile', + 'expanding_skew', 'expanding_std', 'expanding_sum', + 'expanding_var', 'fama_macbeth', 'rolling_apply', + 'rolling_corr', 'rolling_count', 'rolling_cov', + 'rolling_kurt', 'rolling_max', 'rolling_mean', + 'rolling_median', 'rolling_min', 'rolling_quantile', + 'rolling_skew', 'rolling_std', 'rolling_sum', + 'rolling_var', 'rolling_window', 'ordered_merge'] + + def test_api(self): + + self.check(pd, + self.lib + self.lib_to_rename + self.misc + + self.modules + self.deprecated_modules + + self.classes + self.deprecated_classes + + self.deprecated_classes_in_future + + self.remove_classes_from_top_level_namespace + + self.funcs + self.funcs_option + + self.funcs_read + self.funcs_to + + self.deprecated_funcs + + self.deprecated_funcs_in_future, + self.ignored) + + +class TestApi(Base, tm.TestCase): + + allowed = ['tests', 'types'] + + def test_api(self): + + self.check(api, self.allowed) + + +class TestTypes(Base, tm.TestCase): + + allowed = ['is_any_int_dtype', 'is_bool', 'is_bool_dtype', + 'is_categorical', 'is_categorical_dtype', 'is_complex', + 'is_complex_dtype', 'is_datetime64_any_dtype', + 'is_datetime64_dtype', 'is_datetime64_ns_dtype', + 'is_datetime64tz_dtype', 'is_datetimetz', 'is_dtype_equal', + 'is_extension_type', 'is_float', 'is_float_dtype', + 'is_floating_dtype', 'is_int64_dtype', 'is_integer', + 'is_integer_dtype', 'is_number', 'is_numeric_dtype', + 'is_object_dtype', 'is_scalar', 'is_sparse', + 'is_string_dtype', + 'is_timedelta64_dtype', 'is_timedelta64_ns_dtype', + 'is_period', 'is_period_dtype', + 'is_re', 'is_re_compilable', + 'is_dict_like', 'is_iterator', + 'is_list_like', 'is_hashable', + 'is_named_tuple', 'is_sequence', + 'pandas_dtype'] + + def test_types(self): + + self.check(types, self.allowed) + + def check_deprecation(self, fold, fnew): + with tm.assert_produces_warning(DeprecationWarning): + try: + result = fold('foo') + expected = fnew('foo') + self.assertEqual(result, expected) + except TypeError: + self.assertRaises(TypeError, + lambda: fnew('foo')) + except AttributeError: + self.assertRaises(AttributeError, + lambda: fnew('foo')) + + def test_deprecation_core_common(self): + + # test that we are in fact deprecating + # the pandas.core.common introspectors + for t in self.allowed: + self.check_deprecation(getattr(com, t), getattr(types, t)) + + def test_deprecation_core_common_moved(self): + + # these are in pandas.types.common + l = ['is_datetime_arraylike', + 'is_datetime_or_timedelta_dtype', + 'is_datetimelike', + 'is_datetimelike_v_numeric', + 'is_datetimelike_v_object', + 'is_datetimetz', + 'is_int_or_datetime_dtype', + 'is_period_arraylike', + 'is_string_like', + 'is_string_like_dtype'] + + from pandas.types import common as c + for t in l: + self.check_deprecation(getattr(com, t), getattr(c, t)) + + def test_removed_from_core_common(self): + + for t in ['is_null_datelike_scalar', + 'ensure_float']: + self.assertRaises(AttributeError, lambda: getattr(com, t)) + + +class TestDatetools(tm.TestCase): + + def test_deprecation_access_func(self): + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + pd.datetools.to_datetime('2016-01-01') + + def test_deprecation_access_obj(self): + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + pd.datetools.monthEnd + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff 
--git a/pandas/api/types/__init__.py b/pandas/api/types/__init__.py new file mode 100644 index 0000000000000..ee217543f0420 --- /dev/null +++ b/pandas/api/types/__init__.py @@ -0,0 +1,4 @@ +""" public toolkit API """ + +from pandas.types.api import * # noqa +del np # noqa diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 9f07e8ce5e0c6..990018f2f7f3b 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -107,6 +107,10 @@ def signature(f): long = int unichr = chr + # This was introduced in Python 3.3, but we don't support + # Python 3.x < 3.4, so checking PY3 is safe. + FileNotFoundError = FileNotFoundError + # list-producing versions of the major Python iterating functions def lrange(*args, **kwargs): return list(range(*args, **kwargs)) @@ -125,6 +129,8 @@ def lfilter(*args, **kwargs): import re _name_re = re.compile(r"[a-zA-Z_][a-zA-Z0-9_]*$") + FileNotFoundError = IOError + def isidentifier(s, dotted=False): return bool(_name_re.match(s)) diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index 8ecc5dc979792..f2d837a4c9908 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -5,8 +5,6 @@ from distutils.version import LooseVersion from pandas.compat import string_types, string_and_binary_types -# turn off all numpy warnings -np.seterr(all='ignore') # numpy versioning _np_version = np.version.short_version diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index 274761f5d0b9c..adc17c7514832 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -21,7 +21,8 @@ from numpy import ndarray from pandas.util.validators import (validate_args, validate_kwargs, validate_args_and_kwargs) -from pandas.core.common import is_integer, UnsupportedFunctionCall +from pandas.core.common import UnsupportedFunctionCall +from pandas.types.common import is_integer, is_bool from pandas.compat import OrderedDict @@ -148,10 +149,26 @@ def validate_clip_with_axis(axis, args, kwargs): CUM_FUNC_DEFAULTS = OrderedDict() CUM_FUNC_DEFAULTS['dtype'] = None CUM_FUNC_DEFAULTS['out'] = None -validate_cum_func = CompatValidator(CUM_FUNC_DEFAULTS, method='kwargs') +validate_cum_func = CompatValidator(CUM_FUNC_DEFAULTS, method='both', + max_fname_arg_count=1) validate_cumsum = CompatValidator(CUM_FUNC_DEFAULTS, fname='cumsum', method='both', max_fname_arg_count=1) + +def validate_cum_func_with_skipna(skipna, args, kwargs, name): + """ + If this function is called via the 'numpy' library, the third + parameter in its signature is 'dtype', which takes either a + 'numpy' dtype or 'None', so check if the 'skipna' parameter is + a boolean or not + """ + if not is_bool(skipna): + args = (skipna,) + args + skipna = True + + validate_cum_func(args, kwargs, fname=name) + return skipna + LOGICAL_FUNC_DEFAULTS = dict(out=None) validate_logical_func = CompatValidator(LOGICAL_FUNC_DEFAULTS, method='kwargs') diff --git a/pandas/computation/align.py b/pandas/computation/align.py index a117342fdefe2..4e12d58a4ab85 100644 --- a/pandas/computation/align.py +++ b/pandas/computation/align.py @@ -95,7 +95,7 @@ def _align_core(terms): term_axis_size = len(ti.axes[axis]) reindexer_size = len(reindexer) - ordm = np.log10(abs(reindexer_size - term_axis_size)) + ordm = np.log10(max(1, abs(reindexer_size - term_axis_size))) if ordm >= 1 and reindexer_size >= 10000: warnings.warn('Alignment difference on axis {0} is larger ' 'than an order of magnitude on term {1!r}, ' diff --git 
a/pandas/computation/expressions.py b/pandas/computation/expressions.py index 086e92dbde1a0..8fd9ab3477b74 100644 --- a/pandas/computation/expressions.py +++ b/pandas/computation/expressions.py @@ -59,7 +59,8 @@ def _evaluate_standard(op, op_str, a, b, raise_on_error=True, **eval_kwargs): """ standard evaluation """ if _TEST_MODE: _store_test_result(False) - return op(a, b) + with np.errstate(all='ignore'): + return op(a, b) def _can_use_numexpr(op, op_str, a, b, dtype_check): diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py index bf6fa35cf255f..9446e84d891c4 100644 --- a/pandas/computation/ops.py +++ b/pandas/computation/ops.py @@ -7,11 +7,11 @@ import numpy as np +from pandas.types.common import is_list_like, is_scalar import pandas as pd from pandas.compat import PY3, string_types, text_type import pandas.core.common as com from pandas.formats.printing import pprint_thing, pprint_thing_encoded -import pandas.lib as lib from pandas.core.base import StringMixin from pandas.computation.common import _ensure_decoded, _result_type_many from pandas.computation.scope import _DEFAULT_GLOBALS @@ -100,7 +100,7 @@ def update(self, value): @property def isscalar(self): - return lib.isscalar(self._value) + return is_scalar(self._value) @property def type(self): @@ -229,7 +229,7 @@ def _in(x, y): try: return x.isin(y) except AttributeError: - if com.is_list_like(x): + if is_list_like(x): try: return y.isin(x) except AttributeError: @@ -244,7 +244,7 @@ def _not_in(x, y): try: return ~x.isin(y) except AttributeError: - if com.is_list_like(x): + if is_list_like(x): try: return ~y.isin(x) except AttributeError: @@ -286,7 +286,7 @@ def _cast_inplace(terms, acceptable_dtypes, dtype): acceptable_dtypes : list of acceptable numpy.dtype Will not cast if term's dtype in this list. - .. versionadded:: 0.18.2 + .. versionadded:: 0.19.0 dtype : str or numpy.dtype The dtype to cast to. 
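For illustration, a minimal sketch of the errstate pattern adopted above: floating-point warnings are silenced only around the evaluation itself, rather than globally via the removed np.seterr(all='ignore') (the helper name below is illustrative, not pandas API).

import numpy as np

def _evaluate_divide(a, b):
    # Suppress floating-point warnings only for this one operation,
    # mirroring the errstate blocks added around op evaluation in
    # pandas/computation.
    with np.errstate(all='ignore'):
        return a / b

# 0/0 and 1/0 yield nan/inf here without emitting RuntimeWarning, while
# warning behaviour elsewhere in the process is left untouched.
_evaluate_divide(np.array([1.0, 0.0]), np.array([0.0, 0.0]))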
@@ -523,7 +523,8 @@ def __init__(self, func, args): def __call__(self, env): operands = [op(env) for op in self.operands] - return self.func.func(*operands) + with np.errstate(all='ignore'): + return self.func.func(*operands) def __unicode__(self): operands = map(str, self.operands) diff --git a/pandas/computation/pytables.py b/pandas/computation/pytables.py index d6d55d15fec30..a4dd03a0fa7ee 100644 --- a/pandas/computation/pytables.py +++ b/pandas/computation/pytables.py @@ -7,6 +7,8 @@ from datetime import datetime, timedelta import numpy as np import pandas as pd + +from pandas.types.common import is_list_like import pandas.core.common as com from pandas.compat import u, string_types, DeepChainMap from pandas.core.base import StringMixin @@ -127,7 +129,7 @@ def pr(left, right): def conform(self, rhs): """ inplace conform rhs """ - if not com.is_list_like(rhs): + if not is_list_like(rhs): rhs = [rhs] if isinstance(rhs, np.ndarray): rhs = rhs.ravel() @@ -196,6 +198,11 @@ def stringify(value): elif meta == u('category'): metadata = com._values_from_object(self.metadata) result = metadata.searchsorted(v, side='left') + + # result returns 0 if v is first element or if v is not in metadata + # check that metadata contains v + if not result and v not in metadata: + result = -1 return TermValue(result, result, u('integer')) elif kind == u('integer'): v = int(float(v)) diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index 5019dd392a567..c50944f0a4d3b 100644 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -13,6 +13,7 @@ from numpy.random import randn, rand, randint import numpy as np +from pandas.types.common import is_list_like, is_scalar import pandas as pd from pandas.core import common as com from pandas import DataFrame, Series, Panel, date_range @@ -200,7 +201,7 @@ def check_complex_cmp_op(self, lhs, cmp1, rhs, binop, cmp2): ex = '(lhs {cmp1} rhs) {binop} (lhs {cmp2} rhs)'.format(cmp1=cmp1, binop=binop, cmp2=cmp2) - scalar_with_in_notin = (lib.isscalar(rhs) and (cmp1 in skip_these or + scalar_with_in_notin = (is_scalar(rhs) and (cmp1 in skip_these or cmp2 in skip_these)) if scalar_with_in_notin: with tm.assertRaises(TypeError): @@ -253,7 +254,7 @@ def check_operands(left, right, cmp_op): def check_simple_cmp_op(self, lhs, cmp1, rhs): ex = 'lhs {0} rhs'.format(cmp1) - if cmp1 in ('in', 'not in') and not com.is_list_like(rhs): + if cmp1 in ('in', 'not in') and not is_list_like(rhs): self.assertRaises(TypeError, pd.eval, ex, engine=self.engine, parser=self.parser, local_dict={'lhs': lhs, 'rhs': rhs}) @@ -331,7 +332,7 @@ def check_pow(self, lhs, arith1, rhs): expected = self.get_expected_pow_result(lhs, rhs) result = pd.eval(ex, engine=self.engine, parser=self.parser) - if (lib.isscalar(lhs) and lib.isscalar(rhs) and + if (is_scalar(lhs) and is_scalar(rhs) and _is_py3_complex_incompat(result, expected)): self.assertRaises(AssertionError, tm.assert_numpy_array_equal, result, expected) @@ -364,16 +365,16 @@ def check_compound_invert_op(self, lhs, cmp1, rhs): skip_these = 'in', 'not in' ex = '~(lhs {0} rhs)'.format(cmp1) - if lib.isscalar(rhs) and cmp1 in skip_these: + if is_scalar(rhs) and cmp1 in skip_these: self.assertRaises(TypeError, pd.eval, ex, engine=self.engine, parser=self.parser, local_dict={'lhs': lhs, 'rhs': rhs}) else: # compound - if lib.isscalar(lhs) and lib.isscalar(rhs): + if is_scalar(lhs) and is_scalar(rhs): lhs, rhs = map(lambda x: np.array([x]), (lhs, rhs)) expected = _eval_single_bin(lhs, 
cmp1, rhs, self.engine) - if lib.isscalar(expected): + if is_scalar(expected): expected = not expected else: expected = ~expected @@ -643,17 +644,17 @@ def test_identical(self): x = 1 result = pd.eval('x', engine=self.engine, parser=self.parser) self.assertEqual(result, 1) - self.assertTrue(lib.isscalar(result)) + self.assertTrue(is_scalar(result)) x = 1.5 result = pd.eval('x', engine=self.engine, parser=self.parser) self.assertEqual(result, 1.5) - self.assertTrue(lib.isscalar(result)) + self.assertTrue(is_scalar(result)) x = False result = pd.eval('x', engine=self.engine, parser=self.parser) self.assertEqual(result, False) - self.assertTrue(lib.isscalar(result)) + self.assertTrue(is_scalar(result)) x = np.array([1]) result = pd.eval('x', engine=self.engine, parser=self.parser) @@ -1612,7 +1613,8 @@ def test_unary_functions(self): for fn in self.unary_fns: expr = "{0}(a)".format(fn) got = self.eval(expr) - expect = getattr(np, fn)(a) + with np.errstate(all='ignore'): + expect = getattr(np, fn)(a) tm.assert_series_equal(got, expect, check_names=False) def test_binary_functions(self): @@ -1623,7 +1625,8 @@ def test_binary_functions(self): for fn in self.binary_fns: expr = "{0}(a, b)".format(fn) got = self.eval(expr) - expect = getattr(np, fn)(a, b) + with np.errstate(all='ignore'): + expect = getattr(np, fn)(a, b) tm.assert_almost_equal(got, expect, check_names=False) def test_df_use_case(self): diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 4b40bce79cbb5..ee59d6552bb2f 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -7,10 +7,31 @@ import numpy as np from pandas import compat, lib, tslib, _np_version_under1p8 +from pandas.types.cast import _maybe_promote +from pandas.types.generic import ABCSeries, ABCIndex +from pandas.types.common import (is_integer_dtype, + is_int64_dtype, + is_categorical_dtype, + is_extension_type, + is_datetimetz, + is_period_dtype, + is_period_arraylike, + is_float_dtype, + needs_i8_conversion, + is_categorical, + is_datetime64_dtype, + is_timedelta64_dtype, + is_scalar, + _ensure_platform_int, + _ensure_object, + _ensure_float64, + _ensure_int64, + is_list_like) +from pandas.types.missing import isnull + import pandas.core.common as com import pandas.algos as algos import pandas.hashtable as htable -from pandas.types import api as gt from pandas.compat import string_types from pandas.tslib import iNaT @@ -105,12 +126,12 @@ def isin(comps, values): boolean array same length as comps """ - if not com.is_list_like(comps): + if not is_list_like(comps): raise TypeError("only list-like objects are allowed to be passed" " to isin(), you passed a " "[{0}]".format(type(comps).__name__)) comps = np.asarray(comps) - if not com.is_list_like(values): + if not is_list_like(values): raise TypeError("only list-like objects are allowed to be passed" " to isin(), you passed a " "[{0}]".format(type(values).__name__)) @@ -126,15 +147,15 @@ def isin(comps, values): f = lambda x, y: lib.ismember_int64(x, set(y)) # may need i8 conversion for proper membership testing - if com.is_datetime64_dtype(comps): + if is_datetime64_dtype(comps): from pandas.tseries.tools import to_datetime values = to_datetime(values)._values.view('i8') comps = comps.view('i8') - elif com.is_timedelta64_dtype(comps): + elif is_timedelta64_dtype(comps): from pandas.tseries.timedeltas import to_timedelta values = to_timedelta(values)._values.view('i8') comps = comps.view('i8') - elif com.is_int64_dtype(comps): + elif is_int64_dtype(comps): pass else: f = lambda x, y: 
lib.ismember(x, set(values)) @@ -142,6 +163,104 @@ return f(comps, values) + +def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False): + """ + Sort ``values`` and reorder corresponding ``labels``. + ``values`` should be unique if ``labels`` is not None. + Safe for use with mixed types (int, str), orders ints before strs. + + .. versionadded:: 0.19.0 + + Parameters + ---------- + values : list-like + Sequence; must be unique if ``labels`` is not None. + labels : list-like + Indices to ``values``. All out of bound indices are treated as + "not found" and will be masked with ``na_sentinel``. + na_sentinel : int, default -1 + Value in ``labels`` to mark "not found". + Ignored when ``labels`` is None. + assume_unique : bool, default False + When True, ``values`` are assumed to be unique, which can speed up + the calculation. Ignored when ``labels`` is None. + + Returns + ------- + ordered : ndarray + Sorted ``values`` + new_labels : ndarray + Reordered ``labels``; returned when ``labels`` is not None. + + Raises + ------ + TypeError + * If ``values`` is not list-like or if ``labels`` is neither None + nor list-like + * If ``values`` cannot be sorted + ValueError + * If ``labels`` is not None and ``values`` contain duplicates. + """ + if not is_list_like(values): + raise TypeError("Only list-like objects are allowed to be passed to " + "safe_sort as values") + values = np.array(values, copy=False) + + def sort_mixed(values): + # order ints before strings, safe in py3 + str_pos = np.array([isinstance(x, string_types) for x in values], + dtype=bool) + nums = np.sort(values[~str_pos]) + strs = np.sort(values[str_pos]) + return _ensure_object(np.concatenate([nums, strs])) + + sorter = None + if compat.PY3 and lib.infer_dtype(values) == 'mixed-integer': + # unorderable in py3 if mixed str/int + ordered = sort_mixed(values) + else: + try: + sorter = values.argsort() + ordered = values.take(sorter) + except TypeError: + # try this anyway + ordered = sort_mixed(values) + + # labels: + + if labels is None: + return ordered + + if not is_list_like(labels): + raise TypeError("Only list-like objects or None are allowed to be " + "passed to safe_sort as labels") + labels = _ensure_platform_int(np.asarray(labels)) + + from pandas import Index + if not assume_unique and not Index(values).is_unique: + raise ValueError("values should be unique if labels is not None") + + if sorter is None: + # mixed types + (hash_klass, _), values = _get_data_algo(values, _hashtables) + t = hash_klass(len(values)) + t.map_locations(values) + sorter = _ensure_platform_int(t.lookup(ordered)) + + reverse_indexer = np.empty(len(sorter), dtype=np.int_) + reverse_indexer.put(sorter, np.arange(len(sorter))) + + mask = (labels < -len(values)) | (labels >= len(values)) | \ + (labels == na_sentinel) + + # (Out of bound indices will be masked with `na_sentinel` next, so we may + # deal with them here without performance loss using `mode='wrap'`.)
+ new_labels = reverse_indexer.take(labels, mode='wrap') + np.putmask(new_labels, mask, na_sentinel) + + return ordered, _ensure_platform_int(new_labels) + + def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): """ Encode input values as an enumerated type or categorical variable @@ -171,54 +290,30 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): vals = np.asarray(values) # localize to UTC - is_datetimetz = com.is_datetimetz(values) - if is_datetimetz: + is_datetimetz_type = is_datetimetz(values) + if is_datetimetz_type: values = DatetimeIndex(values) - vals = values.tz_localize(None) + vals = values.asi8 - is_datetime = com.is_datetime64_dtype(vals) - is_timedelta = com.is_timedelta64_dtype(vals) + is_datetime = is_datetime64_dtype(vals) + is_timedelta = is_timedelta64_dtype(vals) (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables) table = hash_klass(size_hint or len(vals)) uniques = vec_klass() labels = table.get_labels(vals, uniques, 0, na_sentinel, True) - labels = com._ensure_platform_int(labels) + labels = _ensure_platform_int(labels) uniques = uniques.to_array() if sort and len(uniques) > 0: - try: - sorter = uniques.argsort() - except: - # unorderable in py3 if mixed str/int - t = hash_klass(len(uniques)) - t.map_locations(com._ensure_object(uniques)) - - # order ints before strings - ordered = np.concatenate([ - np.sort(np.array([e for i, e in enumerate(uniques) if f(e)], - dtype=object)) for f in - [lambda x: not isinstance(x, string_types), - lambda x: isinstance(x, string_types)]]) - sorter = com._ensure_platform_int(t.lookup( - com._ensure_object(ordered))) - - reverse_indexer = np.empty(len(sorter), dtype=np.int_) - reverse_indexer.put(sorter, np.arange(len(sorter))) - - mask = labels < 0 - labels = reverse_indexer.take(labels) - np.putmask(labels, mask, -1) - - uniques = uniques.take(sorter) - - if is_datetimetz: + uniques, labels = safe_sort(uniques, labels, na_sentinel=na_sentinel, + assume_unique=True) + if is_datetimetz_type: # reset tz - uniques = DatetimeIndex(uniques.astype('M8[ns]')).tz_localize( - values.tz) + uniques = values._shallow_copy(uniques) elif is_datetime: uniques = uniques.astype('M8[ns]') elif is_timedelta: @@ -267,7 +362,7 @@ def value_counts(values, sort=True, ascending=False, normalize=False, raise TypeError("bins argument only works with numeric data.") values = cat.codes - if com.is_extension_type(values) and not com.is_datetimetz(values): + if is_extension_type(values) and not is_datetimetz(values): # handle Categorical and sparse, # datetime tz can be handeled in ndarray path result = Series(values).values.value_counts(dropna=dropna) @@ -298,9 +393,9 @@ def value_counts(values, sort=True, ascending=False, normalize=False, def _value_counts_arraylike(values, dropna=True): - is_datetimetz = com.is_datetimetz(values) - is_period = (isinstance(values, gt.ABCPeriodIndex) or - com.is_period_arraylike(values)) + is_datetimetz_type = is_datetimetz(values) + is_period_type = (is_period_dtype(values) or + is_period_arraylike(values)) orig = values @@ -308,16 +403,18 @@ def _value_counts_arraylike(values, dropna=True): values = Series(values).values dtype = values.dtype - if com.is_datetime_or_timedelta_dtype(dtype) or is_period: + if needs_i8_conversion(dtype) or is_period_type: + from pandas.tseries.index import DatetimeIndex from pandas.tseries.period import PeriodIndex - if is_period: + if is_period_type: + # values may be an object values = PeriodIndex(values) freq = values.freq 
values = values.view(np.int64) - keys, counts = htable.value_count_scalar64(values, dropna) + keys, counts = htable.value_count_int64(values, dropna) if dropna: msk = keys != iNaT @@ -327,24 +424,20 @@ def _value_counts_arraylike(values, dropna=True): keys = keys.astype(dtype) # dtype handling - if is_datetimetz: - if isinstance(orig, gt.ABCDatetimeIndex): - tz = orig.tz - else: - tz = orig.dt.tz - keys = DatetimeIndex._simple_new(keys, tz=tz) - if is_period: + if is_datetimetz_type: + keys = DatetimeIndex._simple_new(keys, tz=orig.dtype.tz) + if is_period_type: keys = PeriodIndex._simple_new(keys, freq=freq) - elif com.is_integer_dtype(dtype): - values = com._ensure_int64(values) - keys, counts = htable.value_count_scalar64(values, dropna) - elif com.is_float_dtype(dtype): - values = com._ensure_float64(values) - keys, counts = htable.value_count_scalar64(values, dropna) + elif is_integer_dtype(dtype): + values = _ensure_int64(values) + keys, counts = htable.value_count_int64(values, dropna) + elif is_float_dtype(dtype): + values = _ensure_float64(values) + keys, counts = htable.value_count_float64(values, dropna) else: - values = com._ensure_object(values) - mask = com.isnull(values) + values = _ensure_object(values) + mask = isnull(values) keys, counts = htable.value_count_object(values, mask) if not dropna and mask.any(): keys = np.insert(keys, 0, np.NaN) @@ -353,6 +446,52 @@ def _value_counts_arraylike(values, dropna=True): return keys, counts +def duplicated(values, keep='first'): + """ + Return boolean ndarray denoting duplicate values + + .. versionadded:: 0.19.0 + + Parameters + ---------- + keep : {'first', 'last', False}, default 'first' + - ``first`` : Mark duplicates as ``True`` except for the first + occurrence. + - ``last`` : Mark duplicates as ``True`` except for the last + occurrence. + - False : Mark all duplicates as ``True``. + + Returns + ------- + duplicated : ndarray + """ + + dtype = values.dtype + + # no need to revert to original type + if needs_i8_conversion(dtype): + values = values.view(np.int64) + elif is_period_arraylike(values): + from pandas.tseries.period import PeriodIndex + values = PeriodIndex(values).asi8 + elif is_categorical_dtype(dtype): + values = values.values.codes + elif isinstance(values, (ABCSeries, ABCIndex)): + values = values.values + + if is_integer_dtype(dtype): + values = _ensure_int64(values) + duplicated = htable.duplicated_int64(values, keep=keep) + elif is_float_dtype(dtype): + values = _ensure_float64(values) + duplicated = htable.duplicated_float64(values, keep=keep) + else: + values = _ensure_object(values) + duplicated = htable.duplicated_object(values, keep=keep) + + return duplicated + + def mode(values): """Returns the mode or mode(s) of the passed Series or ndarray (sorted)""" # must sort because hash order isn't necessarily defined. 
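Minimal usage sketches of the safe_sort and duplicated helpers added above; the imports use their module path from this change, and the commented results follow the behaviour documented in the docstrings.

import numpy as np
from pandas.core.algorithms import safe_sort, duplicated

# safe_sort: sort unique values and remap labels that index into them;
# out-of-bounds labels and the na_sentinel stay marked as -1.
values = np.array(['b', 'a', 'c'], dtype=object)
labels = np.array([0, 1, 1, 2, -1])
ordered, new_labels = safe_sort(values, labels)
# ordered    -> array(['a', 'b', 'c'], dtype=object)
# new_labels -> array([1, 0, 0, 2, -1])

# duplicated: boolean mask of repeated values, keeping the first occurrence.
duplicated(np.array([1, 2, 1, 3, 1]), keep='first')
# -> array([False, False,  True, False,  True])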
@@ -366,8 +505,8 @@ def mode(values): constructor = Series dtype = values.dtype - if com.is_integer_dtype(values): - values = com._ensure_int64(values) + if is_integer_dtype(values): + values = _ensure_int64(values) result = constructor(sorted(htable.mode_int64(values)), dtype=dtype) elif issubclass(values.dtype.type, (np.datetime64, np.timedelta64)): @@ -375,11 +514,11 @@ def mode(values): values = values.view(np.int64) result = constructor(sorted(htable.mode_int64(values)), dtype=dtype) - elif com.is_categorical_dtype(values): + elif is_categorical_dtype(values): result = constructor(values.mode()) else: - mask = com.isnull(values) - values = com._ensure_object(values) + mask = isnull(values) + values = _ensure_object(values) res = htable.mode_object(values, mask) try: res = sorted(res) @@ -459,7 +598,7 @@ def quantile(x, q, interpolation_method='fraction'): """ x = np.asarray(x) - mask = com.isnull(x) + mask = isnull(x) x = x[~mask] @@ -486,7 +625,7 @@ def _get_score(at): return score - if lib.isscalar(q): + if is_scalar(q): return _get_score(q) else: q = np.asarray(q, np.float64) @@ -593,18 +732,18 @@ def _hashtable_algo(f, dtype, return_dtype=None): """ f(HashTable, type_caster) -> result """ - if com.is_float_dtype(dtype): - return f(htable.Float64HashTable, com._ensure_float64) - elif com.is_integer_dtype(dtype): - return f(htable.Int64HashTable, com._ensure_int64) - elif com.is_datetime64_dtype(dtype): + if is_float_dtype(dtype): + return f(htable.Float64HashTable, _ensure_float64) + elif is_integer_dtype(dtype): + return f(htable.Int64HashTable, _ensure_int64) + elif is_datetime64_dtype(dtype): return_dtype = return_dtype or 'M8[ns]' - return f(htable.Int64HashTable, com._ensure_int64).view(return_dtype) - elif com.is_timedelta64_dtype(dtype): + return f(htable.Int64HashTable, _ensure_int64).view(return_dtype) + elif is_timedelta64_dtype(dtype): return_dtype = return_dtype or 'm8[ns]' - return f(htable.Int64HashTable, com._ensure_int64).view(return_dtype) + return f(htable.Int64HashTable, _ensure_int64).view(return_dtype) else: - return f(htable.PyObjectHashTable, com._ensure_object) + return f(htable.PyObjectHashTable, _ensure_object) _hashtables = { 'float64': (htable.Float64HashTable, htable.Float64Vector), @@ -614,20 +753,20 @@ def _hashtable_algo(f, dtype, return_dtype=None): def _get_data_algo(values, func_map): - if com.is_float_dtype(values): + if is_float_dtype(values): f = func_map['float64'] - values = com._ensure_float64(values) + values = _ensure_float64(values) - elif com.needs_i8_conversion(values): + elif needs_i8_conversion(values): f = func_map['int64'] values = values.view('i8') - elif com.is_integer_dtype(values): + elif is_integer_dtype(values): f = func_map['int64'] - values = com._ensure_int64(values) + values = _ensure_int64(values) else: f = func_map['generic'] - values = com._ensure_object(values) + values = _ensure_object(values) return f, values @@ -689,7 +828,7 @@ def _take_nd_generic(arr, indexer, out, axis, fill_value, mask_info): if arr.dtype != out.dtype: arr = arr.astype(out.dtype) if arr.shape[axis] > 0: - arr.take(com._ensure_platform_int(indexer), axis=axis, out=out) + arr.take(_ensure_platform_int(indexer), axis=axis, out=out) if needs_masking: outindexer = [slice(None)] * arr.ndim outindexer[axis] = mask @@ -830,7 +969,7 @@ def _get_take_nd_function(ndim, arr_dtype, out_dtype, axis=0, mask_info=None): return func def func(arr, indexer, out, fill_value=np.nan): - indexer = com._ensure_int64(indexer) + indexer = _ensure_int64(indexer) 
_take_nd_generic(arr, indexer, out, axis=axis, fill_value=fill_value, mask_info=mask_info) @@ -854,7 +993,7 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, out : ndarray or None, default None Optional output array, must be appropriate type to hold input and fill_value together, if indexer has any -1 value entries; call - common._maybe_promote to determine this type for any fill_value + _maybe_promote to determine this type for any fill_value fill_value : any, default np.nan Fill value to replace -1 values with mask_info : tuple of (ndarray, boolean) @@ -868,24 +1007,24 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, """ # dispatch to internal type takes - if com.is_categorical(arr): + if is_categorical(arr): return arr.take_nd(indexer, fill_value=fill_value, allow_fill=allow_fill) - elif com.is_datetimetz(arr): + elif is_datetimetz(arr): return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) if indexer is None: indexer = np.arange(arr.shape[axis], dtype=np.int64) dtype, fill_value = arr.dtype, arr.dtype.type() else: - indexer = com._ensure_int64(indexer) + indexer = _ensure_int64(indexer) if not allow_fill: dtype, fill_value = arr.dtype, arr.dtype.type() mask_info = None, False else: # check for promotion based on types only (do this first because # it's faster than computing a mask) - dtype, fill_value = com._maybe_promote(arr.dtype, fill_value) + dtype, fill_value = _maybe_promote(arr.dtype, fill_value) if dtype != arr.dtype and (out is None or out.dtype != dtype): # check if promotion is actually required based on indexer if mask_info is not None: @@ -931,7 +1070,7 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, func = _get_take_nd_function(arr.ndim, arr.dtype, out.dtype, axis=axis, mask_info=mask_info) - indexer = com._ensure_int64(indexer) + indexer = _ensure_int64(indexer) func(arr, indexer, out, fill_value) if flip_order: @@ -957,11 +1096,11 @@ def take_2d_multi(arr, indexer, out=None, fill_value=np.nan, mask_info=None, if row_idx is None: row_idx = np.arange(arr.shape[0], dtype=np.int64) else: - row_idx = com._ensure_int64(row_idx) + row_idx = _ensure_int64(row_idx) if col_idx is None: col_idx = np.arange(arr.shape[1], dtype=np.int64) else: - col_idx = com._ensure_int64(col_idx) + col_idx = _ensure_int64(col_idx) indexer = row_idx, col_idx if not allow_fill: dtype, fill_value = arr.dtype, arr.dtype.type() @@ -969,7 +1108,7 @@ def take_2d_multi(arr, indexer, out=None, fill_value=np.nan, mask_info=None, else: # check for promotion based on types only (do this first because # it's faster than computing a mask) - dtype, fill_value = com._maybe_promote(arr.dtype, fill_value) + dtype, fill_value = _maybe_promote(arr.dtype, fill_value) if dtype != arr.dtype and (out is None or out.dtype != dtype): # check if promotion is actually required based on indexer if mask_info is not None: @@ -1032,7 +1171,7 @@ def diff(arr, n, axis=0): na = np.nan dtype = arr.dtype is_timedelta = False - if com.needs_i8_conversion(arr): + if needs_i8_conversion(arr): dtype = np.float64 arr = arr.view('i8') na = tslib.iNaT diff --git a/pandas/core/api.py b/pandas/core/api.py index 0a6992bfebd70..c0f39e2ac4717 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -5,7 +5,7 @@ import numpy as np from pandas.core.algorithms import factorize, match, unique, value_counts -from pandas.core.common import isnull, notnull +from pandas.types.missing import isnull, notnull from pandas.core.categorical 
import Categorical from pandas.core.groupby import Grouper from pandas.formats.format import set_eng_float_format @@ -28,8 +28,18 @@ from pandas.tseries.tdi import TimedeltaIndex, Timedelta from pandas.tseries.period import Period, PeriodIndex -# legacy -import pandas.core.datetools as datetools +# see gh-14094. +from pandas.util.depr_module import _DeprecatedModule + +_alts = ['pandas.tseries.tools', 'pandas.tseries.offsets', + 'pandas.tseries.frequencies'] +_removals = ['day', 'bday', 'businessDay', 'cday', 'customBusinessDay', + 'customBusinessMonthEnd', 'customBusinessMonthBegin', + 'monthEnd', 'yearEnd', 'yearBegin', 'bmonthEnd', 'bmonthBegin', + 'cbmonthEnd', 'cbmonthBegin', 'bquarterEnd', 'quarterEnd', + 'byearEnd', 'week'] +datetools = _DeprecatedModule(deprmod='pandas.core.datetools', alts=_alts, + removals=_removals) from pandas.core.config import (get_option, set_option, reset_option, describe_option, option_context, options) diff --git a/pandas/core/base.py b/pandas/core/base.py index 96732a7140f9e..b9a70292498e4 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -4,6 +4,11 @@ from pandas import compat from pandas.compat import builtins import numpy as np + +from pandas.types.missing import isnull +from pandas.types.generic import ABCDataFrame, ABCSeries, ABCIndexClass +from pandas.types.common import is_object_dtype, is_list_like, is_scalar + from pandas.core import common as com import pandas.core.nanops as nanops import pandas.lib as lib @@ -11,12 +16,11 @@ from pandas.util.decorators import (Appender, cache_readonly, deprecate_kwarg, Substitution) from pandas.core.common import AbstractMethodError -from pandas.types import api as gt from pandas.formats.printing import pprint_thing _shared_docs = dict() _indexops_doc_kwargs = dict(klass='IndexOpsMixin', inplace='', - duplicated='IndexOpsMixin') + unique='IndexOpsMixin', duplicated='IndexOpsMixin') class StringMixin(object): @@ -121,7 +125,7 @@ def __sizeof__(self): """ if hasattr(self, 'memory_usage'): mem = self.memory_usage(deep=True) - if not lib.isscalar(mem): + if not is_scalar(mem): mem = mem.sum() return int(mem) @@ -293,15 +297,15 @@ def name(self): @property def _selection_list(self): - if not isinstance(self._selection, (list, tuple, gt.ABCSeries, - gt.ABCIndex, np.ndarray)): + if not isinstance(self._selection, (list, tuple, ABCSeries, + ABCIndexClass, np.ndarray)): return [self._selection] return self._selection @cache_readonly def _selected_obj(self): - if self._selection is None or isinstance(self.obj, gt.ABCSeries): + if self._selection is None or isinstance(self.obj, ABCSeries): return self.obj else: return self.obj[self._selection] @@ -313,7 +317,7 @@ def ndim(self): @cache_readonly def _obj_with_exclusions(self): if self._selection is not None and isinstance(self.obj, - gt.ABCDataFrame): + ABCDataFrame): return self.obj.reindex(columns=self._selection_list) if len(self.exclusions) > 0: @@ -325,7 +329,7 @@ def __getitem__(self, key): if self._selection is not None: raise Exception('Column(s) %s already selected' % self._selection) - if isinstance(key, (list, tuple, gt.ABCSeries, gt.ABCIndex, + if isinstance(key, (list, tuple, ABCSeries, ABCIndexClass, np.ndarray)): if len(self.obj.columns.intersection(key)) != len(key): bad_keys = list(set(key).difference(self.obj.columns)) @@ -553,7 +557,7 @@ def _agg(arg, func): if isinstance(result, list): result = concat(result, keys=keys, axis=1) elif isinstance(list(compat.itervalues(result))[0], - gt.ABCDataFrame): + ABCDataFrame): result = 
concat([result[k] for k in keys], keys=keys, axis=1) else: from pandas import DataFrame @@ -682,7 +686,7 @@ def _gotitem(self, key, ndim, subset=None): **kwargs) self._reset_cache() if subset.ndim == 2: - if lib.isscalar(key) and key in subset or com.is_list_like(key): + if is_scalar(key) and key in subset or is_list_like(key): self._selection = key return self @@ -903,7 +907,7 @@ def argmin(self, axis=None): @cache_readonly def hasnans(self): """ return if I have any nans; enables various perf speedups """ - return com.isnull(self).any() + return isnull(self).any() def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds): @@ -947,21 +951,27 @@ def value_counts(self, normalize=False, sort=True, ascending=False, normalize=normalize, bins=bins, dropna=dropna) return result - def unique(self): + _shared_docs['unique'] = ( """ - Return array of unique values in the object. Significantly faster than - numpy.unique. Includes NA values. + Return %(unique)s of unique values in the object. + Significantly faster than numpy.unique. Includes NA values. + The order of the original is preserved. Returns ------- - uniques : ndarray - """ - from pandas.core.nanops import unique1d - values = self.values - if hasattr(values, 'unique'): - return values.unique() + uniques : %(unique)s + """) - return unique1d(values) + @Appender(_shared_docs['unique'] % _indexops_doc_kwargs) + def unique(self): + values = self._values + + if hasattr(values, 'unique'): + result = values.unique() + else: + from pandas.core.nanops import unique1d + result = unique1d(values) + return result def nunique(self, dropna=True): """ @@ -980,7 +990,7 @@ def nunique(self, dropna=True): """ uniqs = self.unique() n = len(uniqs) - if dropna and com.isnull(uniqs).any(): + if dropna and isnull(uniqs).any(): n -= 1 return n @@ -1001,7 +1011,7 @@ def is_monotonic(self): Return boolean if values in the object are monotonic_increasing - .. versionadded:: 0.18.2 + .. versionadded:: 0.19.0 Returns ------- @@ -1009,6 +1019,7 @@ def is_monotonic(self): """ from pandas import Index return Index(self).is_monotonic + is_monotonic_increasing = is_monotonic @property @@ -1017,7 +1028,7 @@ def is_monotonic_decreasing(self): Return boolean if values in the object are monotonic_decreasing - .. versionadded:: 0.18.2 + .. 
versionadded:: 0.19.0 Returns ------- @@ -1053,7 +1064,7 @@ def memory_usage(self, deep=False): return self.values.memory_usage(deep=deep) v = self.values.nbytes - if deep and com.is_object_dtype(self): + if deep and is_object_dtype(self): v += lib.memory_usage_of_objects(self.values) return v @@ -1166,6 +1177,10 @@ def searchsorted(self, key, side='left', sorter=None): False: 'first'}) @Appender(_shared_docs['drop_duplicates'] % _indexops_doc_kwargs) def drop_duplicates(self, keep='first', inplace=False): + if isinstance(self, ABCIndexClass): + if self.is_unique: + return self._shallow_copy() + duplicated = self.duplicated(keep=keep) result = self[np.logical_not(duplicated)] if inplace: @@ -1195,13 +1210,14 @@ def drop_duplicates(self, keep='first', inplace=False): False: 'first'}) @Appender(_shared_docs['duplicated'] % _indexops_doc_kwargs) def duplicated(self, keep='first'): - keys = com._values_from_object(com._ensure_object(self.values)) - duplicated = lib.duplicated(keys, keep=keep) - try: - return self._constructor(duplicated, + from pandas.core.algorithms import duplicated + if isinstance(self, ABCIndexClass): + if self.is_unique: + return np.zeros(len(self), dtype=np.bool) + return duplicated(self, keep=keep) + else: + return self._constructor(duplicated(self, keep=keep), index=self.index).__finalize__(self) - except AttributeError: - return np.array(duplicated, dtype=bool) # ---------------------------------------------------------------------- # abstracts diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index fa3d13c174245..0a13c8936eeec 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -5,7 +5,24 @@ import types from pandas import compat, lib -from pandas.compat import u +from pandas.compat import u, lzip + +from pandas.types.generic import ABCSeries, ABCIndexClass, ABCCategoricalIndex +from pandas.types.missing import isnull, notnull +from pandas.types.cast import (_possibly_infer_to_datetimelike, + _coerce_indexer_dtype) +from pandas.types.dtypes import CategoricalDtype +from pandas.types.common import (_ensure_int64, + _ensure_object, + _ensure_platform_int, + is_dtype_equal, + is_datetimelike, + is_categorical, + is_categorical_dtype, + is_integer_dtype, is_bool, + is_list_like, is_sequence, + is_scalar) +from pandas.core.common import is_null_slice from pandas.core.algorithms import factorize, take_1d from pandas.core.base import (PandasObject, PandasDelegate, @@ -16,13 +33,6 @@ from pandas.util.decorators import (Appender, cache_readonly, deprecate_kwarg, Substitution) -from pandas.core.common import ( - ABCSeries, ABCIndexClass, ABCCategoricalIndex, isnull, notnull, - is_dtype_equal, is_categorical_dtype, is_integer_dtype, - _possibly_infer_to_datetimelike, is_list_like, - is_sequence, is_null_slice, is_bool, _ensure_object, _ensure_int64, - _coerce_indexer_dtype) -from pandas.types.api import CategoricalDtype from pandas.util.terminal import get_terminal_size from pandas.core.config import get_option @@ -64,7 +74,7 @@ def f(self, other): # With cat[0], for example, being ``np.int64(1)`` by the time it gets # into this function would become ``np.array(1)``. 
other = lib.item_from_zerodim(other) - if lib.isscalar(other): + if is_scalar(other): if other in self.categories: i = self.categories.get_loc(other) return getattr(self._codes, op)(i) @@ -219,8 +229,10 @@ class Categorical(PandasObject): __array_priority__ = 1000 _typ = 'categorical' - def __init__(self, values, categories=None, ordered=False, name=None, - fastpath=False, levels=None): + def __init__(self, values, categories=None, ordered=False, + name=None, fastpath=False): + + self._validate_ordered(ordered) if fastpath: # fast path @@ -236,17 +248,6 @@ def __init__(self, values, categories=None, ordered=False, name=None, "name=\"something\")'") warn(msg, UserWarning, stacklevel=2) - # TODO: Remove after deprecation period in 2017/ after 0.18 - if levels is not None: - warn("Creating a 'Categorical' with 'levels' is deprecated, use " - "'categories' instead", FutureWarning, stacklevel=2) - if categories is None: - categories = levels - else: - raise ValueError("Cannot pass in both 'categories' and " - "(deprecated) 'levels', use only " - "'categories'", stacklevel=2) - # sanitize input if is_categorical_dtype(values): @@ -258,7 +259,7 @@ def __init__(self, values, categories=None, ordered=False, name=None, ordered = values.ordered if categories is None: categories = values.categories - values = values.__array__() + values = values.get_values() elif isinstance(values, (ABCIndexClass, ABCSeries)): pass @@ -330,11 +331,16 @@ def __init__(self, values, categories=None, ordered=False, name=None, self._categories = categories self._codes = _coerce_indexer_dtype(codes, categories) + @property + def _constructor(self): + return Categorical + def copy(self): """ Copy constructor. """ - return Categorical(values=self._codes.copy(), - categories=self.categories, ordered=self.ordered, - fastpath=True) + return self._constructor(values=self._codes.copy(), + categories=self.categories, + ordered=self.ordered, + fastpath=True) def astype(self, dtype, copy=True): """ @@ -348,7 +354,7 @@ def astype(self, dtype, copy=True): If copy is set to False and dtype is categorical, the original object is returned. - .. versionadded:: 0.18.2 + .. versionadded:: 0.19.0 """ if is_categorical_dtype(dtype): @@ -374,11 +380,28 @@ def itemsize(self): def reshape(self, new_shape, *args, **kwargs): """ - An ndarray-compatible method that returns - `self` because categorical instances cannot - actually be reshaped. + DEPRECATED: calling this method will raise an error in a + future release. + + An ndarray-compatible method that returns `self` because + `Categorical` instances cannot actually be reshaped. + + Parameters + ---------- + new_shape : int or tuple of ints + A 1-D array of integers that correspond to the new + shape of the `Categorical`. For more information on + the parameter, please refer to `np.reshape`. """ + warn("reshape is deprecated and will raise " + "in a subsequent release", FutureWarning, stacklevel=2) + nv.validate_reshape(args, kwargs) + + # while the 'new_shape' parameter has no effect, + # we should still enforce valid shape parameters + np.reshape(self.codes, new_shape) + return self @property @@ -389,6 +412,8 @@ def base(self): @classmethod def from_array(cls, data, **kwargs): """ + DEPRECATED: Use ``Categorical`` instead. + Make a Categorical type from a single array-like object. For internal compatibility with numpy arrays. @@ -399,7 +424,9 @@ def from_array(cls, data, **kwargs): Can be an Index or array-like. The categories are assumed to be the unique values of `data`. 
""" - return Categorical(data, **kwargs) + warn("Categorical.from_array is deprecated, use Categorical instead", + FutureWarning, stacklevel=2) + return cls(data, **kwargs) @classmethod def from_codes(cls, codes, categories, ordered=False, name=None): @@ -443,8 +470,8 @@ def from_codes(cls, codes, categories, ordered=False, name=None): raise ValueError("codes need to be between -1 and " "len(categories)-1") - return Categorical(codes, categories=categories, ordered=ordered, - fastpath=True) + return cls(codes, categories=categories, ordered=ordered, + fastpath=True) _codes = None @@ -482,6 +509,25 @@ def _get_labels(self): _categories = None + @classmethod + def _validate_ordered(cls, ordered): + """ + Validates that we have a valid ordered parameter. If + it is not a boolean, a TypeError will be raised. + + Parameters + ---------- + ordered : object + The parameter to be verified. + + Raises + ------ + TypeError + If 'ordered' is not a boolean. + """ + if not is_bool(ordered): + raise TypeError("'ordered' must either be 'True' or 'False'") + @classmethod def _validate_categories(cls, categories, fastpath=False): """ @@ -554,29 +600,8 @@ def _get_categories(self): categories = property(fget=_get_categories, fset=_set_categories, doc=_categories_doc) - def _set_levels(self, levels): - """ set new levels (deprecated, use "categories") """ - warn("Assigning to 'levels' is deprecated, use 'categories'", - FutureWarning, stacklevel=2) - self.categories = levels - - def _get_levels(self): - """ Gets the levels (deprecated, use "categories") """ - warn("Accessing 'levels' is deprecated, use 'categories'", - FutureWarning, stacklevel=2) - return self.categories - - # TODO: Remove after deprecation period in 2017/ after 0.18 - levels = property(fget=_get_levels, fset=_set_levels) - _ordered = None - def _set_ordered(self, value): - """ Sets the ordered attribute to the boolean value """ - warn("Setting 'ordered' directly is deprecated, use 'set_ordered'", - FutureWarning, stacklevel=2) - self.set_ordered(value, inplace=True) - def set_ordered(self, value, inplace=False): """ Sets the ordered attribute to the boolean value @@ -589,8 +614,7 @@ def set_ordered(self, value, inplace=False): Whether or not to set the ordered attribute inplace or return a copy of this categorical with ordered set to the value """ - if not is_bool(value): - raise TypeError("ordered must be a boolean value") + self._validate_ordered(value) cat = self if inplace else self.copy() cat._ordered = value if not inplace: @@ -624,7 +648,7 @@ def _get_ordered(self): """ Gets the ordered attribute """ return self._ordered - ordered = property(fget=_get_ordered, fset=_set_ordered) + ordered = property(fget=_get_ordered) def set_categories(self, new_categories, ordered=None, rename=False, inplace=False): @@ -922,9 +946,9 @@ def map(self, mapper): """ new_categories = self.categories.map(mapper) try: - return Categorical.from_codes(self._codes.copy(), - categories=new_categories, - ordered=self.ordered) + return self.from_codes(self._codes.copy(), + categories=new_categories, + ordered=self.ordered) except ValueError: return np.take(new_categories, self._codes) @@ -968,14 +992,14 @@ def shift(self, periods): if codes.ndim > 1: raise NotImplementedError("Categorical with ndim > 1.") if np.prod(codes.shape) and (periods != 0): - codes = np.roll(codes, com._ensure_platform_int(periods), axis=0) + codes = np.roll(codes, _ensure_platform_int(periods), axis=0) if periods > 0: codes[:periods] = -1 else: codes[periods:] = -1 - return 
Categorical.from_codes(codes, categories=self.categories, - ordered=self.ordered) + return self.from_codes(codes, categories=self.categories, + ordered=self.ordered) def __array__(self, dtype=None): """ @@ -999,11 +1023,12 @@ def __setstate__(self, state): raise Exception('invalid pickle state') # Provide compatibility with pre-0.15.0 Categoricals. - if '_codes' not in state and 'labels' in state: - state['_codes'] = state.pop('labels').astype(np.int8) if '_categories' not in state and '_levels' in state: state['_categories'] = self._validate_categories(state.pop( '_levels')) + if '_codes' not in state and 'labels' in state: + state['_codes'] = _coerce_indexer_dtype(state.pop('labels'), + state['_categories']) # 0.16.0 ordered change if '_ordered' not in state: @@ -1147,7 +1172,7 @@ def value_counts(self, dropna=True): counts : Series """ from numpy import bincount - from pandas.core.common import isnull + from pandas.types.missing import isnull from pandas.core.series import Series from pandas.core.index import CategoricalIndex @@ -1164,8 +1189,8 @@ def value_counts(self, dropna=True): count = bincount(np.where(mask, code, ncat)) ix = np.append(ix, -1) - ix = Categorical(ix, categories=cat, ordered=obj.ordered, - fastpath=True) + ix = self._constructor(ix, categories=cat, ordered=obj.ordered, + fastpath=True) return Series(count, index=CategoricalIndex(ix), dtype='int64') @@ -1181,7 +1206,7 @@ def get_values(self): Index if datetime / periods """ # if we are a datetime and period index, return Index to keep metadata - if com.is_datetimelike(self.categories): + if is_datetimelike(self.categories): return self.categories.take(self._codes, fill_value=np.nan) return np.array(self) @@ -1318,8 +1343,8 @@ def sort_values(self, inplace=False, ascending=True, na_position='last'): self._codes = codes return else: - return Categorical(values=codes, categories=self.categories, - ordered=self.ordered, fastpath=True) + return self._constructor(values=codes, categories=self.categories, + ordered=self.ordered, fastpath=True) def order(self, inplace=False, ascending=True, na_position='last'): """ @@ -1444,10 +1469,13 @@ def fillna(self, value=None, method=None, limit=None): mask = values == -1 if mask.any(): values = values.copy() - values[mask] = self.categories.get_loc(value) + if isnull(value): + values[mask] = -1 + else: + values[mask] = self.categories.get_loc(value) - return Categorical(values, categories=self.categories, - ordered=self.ordered, fastpath=True) + return self._constructor(values, categories=self.categories, + ordered=self.ordered, fastpath=True) def take_nd(self, indexer, allow_fill=True, fill_value=None): """ Take the codes by the indexer, fill with the fill_value. 
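# Illustrative sketch, not part of the patch: the fillna() hunk above now maps
# a null fill value back to code -1 rather than looking it up in the
# categories. Assuming a build with this change:
import numpy as np
import pandas as pd

cat = pd.Categorical(['a', None, 'b'], categories=['a', 'b'])
print(cat.fillna('a').codes)      # [0, 0, 1]  -- missing slot filled with 'a'
print(cat.fillna(np.nan).codes)   # [0, -1, 1] -- a null fill value keeps code -1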
@@ -1460,8 +1488,8 @@ def take_nd(self, indexer, allow_fill=True, fill_value=None): assert isnull(fill_value) codes = take_1d(self._codes, indexer, allow_fill=True, fill_value=-1) - result = Categorical(codes, categories=self.categories, - ordered=self.ordered, fastpath=True) + result = self._constructor(codes, categories=self.categories, + ordered=self.ordered, fastpath=True) return result take = take_nd @@ -1481,8 +1509,8 @@ def _slice(self, slicer): slicer = slicer[1] _codes = self._codes[slicer] - return Categorical(values=_codes, categories=self.categories, - ordered=self.ordered, fastpath=True) + return self._constructor(values=_codes, categories=self.categories, + ordered=self.ordered, fastpath=True) def __len__(self): """The length of this Categorical.""" @@ -1593,10 +1621,9 @@ def __getitem__(self, key): else: return self.categories[i] else: - return Categorical(values=self._codes[key], - categories=self.categories, - ordered=self.ordered, - fastpath=True) + return self._constructor(values=self._codes[key], + categories=self.categories, + ordered=self.ordered, fastpath=True) def __setitem__(self, key, value): """ Item assignment. @@ -1747,8 +1774,8 @@ def mode(self): import pandas.hashtable as htable good = self._codes != -1 values = sorted(htable.mode_int64(_ensure_int64(self._codes[good]))) - result = Categorical(values=values, categories=self.categories, - ordered=self.ordered, fastpath=True) + result = self._constructor(values=values, categories=self.categories, + ordered=self.ordered, fastpath=True) return result def unique(self): @@ -1842,8 +1869,8 @@ def repeat(self, repeats, *args, **kwargs): """ nv.validate_repeat(args, kwargs) codes = self._codes.repeat(repeats) - return Categorical(values=codes, categories=self.categories, - ordered=self.ordered, fastpath=True) + return self._constructor(values=codes, categories=self.categories, + ordered=self.ordered, fastpath=True) # The Series.cat accessor @@ -1932,8 +1959,52 @@ def _convert_to_list_like(list_like): if (is_sequence(list_like) or isinstance(list_like, tuple) or isinstance(list_like, types.GeneratorType)): return list(list_like) - elif lib.isscalar(list_like): + elif is_scalar(list_like): return [list_like] else: # is this reached? return [list_like] + + +def _factorize_from_iterable(values): + """ + Factorize an input `values` into `categories` and `codes`. Preserves + categorical dtype in `categories`. + + *This is an internal function* + + Parameters + ---------- + values : list-like + + Returns + ------- + codes : np.array + categories : Index + If `values` has a categorical dtype, then `categories` is + a CategoricalIndex keeping the categories and order of `values`. + """ + from pandas.indexes.category import CategoricalIndex + + if is_categorical(values): + if isinstance(values, (ABCCategoricalIndex, ABCSeries)): + values = values._values + categories = CategoricalIndex(values.categories, + categories=values.categories, + ordered=values.ordered) + codes = values.codes + else: + cat = Categorical(values, ordered=True) + categories = cat.categories + codes = cat.codes + return codes, categories + + +def _factorize_from_iterables(iterables): + """ + A higher-level wrapper over `_factorize_from_iterable`. + See `_factorize_from_iterable` for more info. 
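# Illustrative sketch, not part of the patch: `_factorize_from_iterable`
# (added above) hands back integer codes plus an Index of categories, and
# keeps the categorical dtype and order when the input is already categorical.
# Assuming a 0.19-era build where the helper lives in pandas.core.categorical:
import pandas as pd
from pandas.core.categorical import _factorize_from_iterable

codes, categories = _factorize_from_iterable(['b', 'a', 'b'])
print(codes)         # [1, 0, 1]
print(categories)    # Index(['a', 'b'], dtype='object')

codes, categories = _factorize_from_iterable(
    pd.Categorical(['b', 'a'], categories=['b', 'a'], ordered=True))
print(categories.ordered)   # CategoricalIndex preserves the original order: True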
+ + *This is an internal function* + """ + return lzip(*[_factorize_from_iterable(it) for it in iterables]) diff --git a/pandas/core/common.py b/pandas/core/common.py index d26c59e62de30..341bd3b4cc845 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -2,23 +2,66 @@ Misc tools for implementing data structures """ -import re -import collections -import numbers +import sys +import warnings from datetime import datetime, timedelta from functools import partial import numpy as np -import pandas as pd -import pandas.algos as algos import pandas.lib as lib import pandas.tslib as tslib from pandas import compat -from pandas.compat import (long, zip, map, string_types, - iteritems) -from pandas.types import api as gt -from pandas.types.api import * # noqa +from pandas.compat import long, zip, iteritems from pandas.core.config import get_option +from pandas.types.generic import ABCSeries +from pandas.types.common import _NS_DTYPE +from pandas.types.inference import _iterable_not_string +from pandas.types.missing import isnull +from pandas.api import types +from pandas.types import common + +# back-compat of public API +# deprecate these functions +m = sys.modules['pandas.core.common'] +for t in [t for t in dir(types) if not t.startswith('_')]: + + def outer(t=t): + + def wrapper(*args, **kwargs): + warnings.warn("pandas.core.common.{t} is deprecated. " + "import from the public API: " + "pandas.api.types.{t} instead".format(t=t), + DeprecationWarning, stacklevel=3) + return getattr(types, t)(*args, **kwargs) + return wrapper + + setattr(m, t, outer(t)) + +# back-compat for non-public functions +# deprecate these functions +for t in ['is_datetime_arraylike', + 'is_datetime_or_timedelta_dtype', + 'is_datetimelike', + 'is_datetimelike_v_numeric', + 'is_datetimelike_v_object', + 'is_datetimetz', + 'is_int_or_datetime_dtype', + 'is_period_arraylike', + 'is_string_like', + 'is_string_like_dtype']: + + def outer(t=t): + + def wrapper(*args, **kwargs): + warnings.warn("pandas.core.common.{t} is deprecated. " + "These are not longer public API functions, " + "but can be imported from " + "pandas.types.common.{t} instead".format(t=t), + DeprecationWarning, stacklevel=3) + return getattr(common, t)(*args, **kwargs) + return wrapper + + setattr(m, t, outer(t)) class PandasError(Exception): @@ -58,305 +101,6 @@ def __str__(self): self.class_instance.__class__.__name__) -_POSSIBLY_CAST_DTYPES = set([np.dtype(t).name - for t in ['O', 'int8', 'uint8', 'int16', 'uint16', - 'int32', 'uint32', 'int64', 'uint64']]) - -_NS_DTYPE = np.dtype('M8[ns]') -_TD_DTYPE = np.dtype('m8[ns]') -_INT64_DTYPE = np.dtype(np.int64) -_DATELIKE_DTYPES = set([np.dtype(t) - for t in ['M8[ns]', 'M8[ns]', - 'm8[ns]', 'm8[ns]']]) -_int8_max = np.iinfo(np.int8).max -_int16_max = np.iinfo(np.int16).max -_int32_max = np.iinfo(np.int32).max -_int64_max = np.iinfo(np.int64).max - - -def isnull(obj): - """Detect missing values (NaN in numeric arrays, None/NaN in object arrays) - - Parameters - ---------- - arr : ndarray or object value - Object to check for null-ness - - Returns - ------- - isnulled : array-like of bool or bool - Array or bool indicating whether an object is null or if an array is - given which of the element is null. 
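# Illustrative sketch, not part of the patch: the pandas/core/common.py hunk
# above re-exports the public pandas.api.types functions through wrappers that
# emit a DeprecationWarning, so old imports keep working but complain.
# Assuming a 0.19-era build with this shim in place:
import warnings
import pandas.core.common as com
from pandas.api import types

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    com.is_integer(3)                    # deprecated spelling, still works
print(caught[0].category.__name__)       # 'DeprecationWarning'

print(types.is_integer(3))               # preferred public spelling: True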
- - See also - -------- - pandas.notnull: boolean inverse of pandas.isnull - """ - return _isnull(obj) - - -def _isnull_new(obj): - if lib.isscalar(obj): - return lib.checknull(obj) - # hack (for now) because MI registers as ndarray - elif isinstance(obj, pd.MultiIndex): - raise NotImplementedError("isnull is not defined for MultiIndex") - elif isinstance(obj, (gt.ABCSeries, np.ndarray, pd.Index)): - return _isnull_ndarraylike(obj) - elif isinstance(obj, gt.ABCGeneric): - return obj._constructor(obj._data.isnull(func=isnull)) - elif isinstance(obj, list) or hasattr(obj, '__array__'): - return _isnull_ndarraylike(np.asarray(obj)) - else: - return obj is None - - -def _isnull_old(obj): - """Detect missing values. Treat None, NaN, INF, -INF as null. - - Parameters - ---------- - arr: ndarray or object value - - Returns - ------- - boolean ndarray or boolean - """ - if lib.isscalar(obj): - return lib.checknull_old(obj) - # hack (for now) because MI registers as ndarray - elif isinstance(obj, pd.MultiIndex): - raise NotImplementedError("isnull is not defined for MultiIndex") - elif isinstance(obj, (gt.ABCSeries, np.ndarray, pd.Index)): - return _isnull_ndarraylike_old(obj) - elif isinstance(obj, gt.ABCGeneric): - return obj._constructor(obj._data.isnull(func=_isnull_old)) - elif isinstance(obj, list) or hasattr(obj, '__array__'): - return _isnull_ndarraylike_old(np.asarray(obj)) - else: - return obj is None - - -_isnull = _isnull_new - - -def _use_inf_as_null(key): - """Option change callback for null/inf behaviour - Choose which replacement for numpy.isnan / ~numpy.isfinite is used. - - Parameters - ---------- - flag: bool - True means treat None, NaN, INF, -INF as null (old way), - False means None and NaN are null, but INF, -INF are not null - (new way). - - Notes - ----- - This approach to setting global module values is discussed and - approved here: - - * http://stackoverflow.com/questions/4859217/ - programmatically-creating-variables-in-python/4859312#4859312 - """ - flag = get_option(key) - if flag: - globals()['_isnull'] = _isnull_old - else: - globals()['_isnull'] = _isnull_new - - -def _isnull_ndarraylike(obj): - - values = getattr(obj, 'values', obj) - dtype = values.dtype - - if is_string_dtype(dtype): - if is_categorical_dtype(values): - from pandas import Categorical - if not isinstance(values, Categorical): - values = values.values - result = values.isnull() - else: - - # Working around NumPy ticket 1542 - shape = values.shape - - if is_string_like_dtype(dtype): - result = np.zeros(values.shape, dtype=bool) - else: - result = np.empty(shape, dtype=bool) - vec = lib.isnullobj(values.ravel()) - result[...] 
= vec.reshape(shape) - - elif is_datetimelike(obj): - # this is the NaT pattern - result = values.view('i8') == tslib.iNaT - else: - result = np.isnan(values) - - # box - if isinstance(obj, gt.ABCSeries): - from pandas import Series - result = Series(result, index=obj.index, name=obj.name, copy=False) - - return result - - -def _isnull_ndarraylike_old(obj): - values = getattr(obj, 'values', obj) - dtype = values.dtype - - if is_string_dtype(dtype): - # Working around NumPy ticket 1542 - shape = values.shape - - if is_string_like_dtype(dtype): - result = np.zeros(values.shape, dtype=bool) - else: - result = np.empty(shape, dtype=bool) - vec = lib.isnullobj_old(values.ravel()) - result[:] = vec.reshape(shape) - - elif dtype in _DATELIKE_DTYPES: - # this is the NaT pattern - result = values.view('i8') == tslib.iNaT - else: - result = ~np.isfinite(values) - - # box - if isinstance(obj, gt.ABCSeries): - from pandas import Series - result = Series(result, index=obj.index, name=obj.name, copy=False) - - return result - - -def notnull(obj): - """Replacement for numpy.isfinite / ~numpy.isnan which is suitable for use - on object arrays. - - Parameters - ---------- - arr : ndarray or object value - Object to check for *not*-null-ness - - Returns - ------- - isnulled : array-like of bool or bool - Array or bool indicating whether an object is *not* null or if an array - is given which of the element is *not* null. - - See also - -------- - pandas.isnull : boolean inverse of pandas.notnull - """ - res = isnull(obj) - if lib.isscalar(res): - return not res - return ~res - - -def is_null_datelike_scalar(other): - """ test whether the object is a null datelike, e.g. Nat - but guard against passing a non-scalar """ - if other is pd.NaT or other is None: - return True - elif lib.isscalar(other): - - # a timedelta - if hasattr(other, 'dtype'): - return other.view('i8') == tslib.iNaT - elif is_integer(other) and other == tslib.iNaT: - return True - return isnull(other) - return False - - -def array_equivalent(left, right, strict_nan=False): - """ - True if two arrays, left and right, have equal non-NaN elements, and NaNs - in corresponding locations. False otherwise. It is assumed that left and - right are NumPy arrays of the same dtype. The behavior of this function - (particularly with respect to NaNs) is not defined if the dtypes are - different. - - Parameters - ---------- - left, right : ndarrays - strict_nan : bool, default False - If True, consider NaN and None to be different. - - Returns - ------- - b : bool - Returns True if the arrays are equivalent. - - Examples - -------- - >>> array_equivalent( - ... np.array([1, 2, np.nan]), - ... np.array([1, 2, np.nan])) - True - >>> array_equivalent( - ... np.array([1, np.nan, 2]), - ... np.array([1, 2, np.nan])) - False - """ - - left, right = np.asarray(left), np.asarray(right) - - # shape compat - if left.shape != right.shape: - return False - - # Object arrays can contain None, NaN and NaT. - # string dtypes must be come to this path for NumPy 1.7.1 compat - if is_string_dtype(left) or is_string_dtype(right): - - if not strict_nan: - # pd.isnull considers NaN and None to be equivalent. 
- return lib.array_equivalent_object(_ensure_object(left.ravel()), - _ensure_object(right.ravel())) - - for left_value, right_value in zip(left, right): - if left_value is tslib.NaT and right_value is not tslib.NaT: - return False - - elif isinstance(left_value, float) and np.isnan(left_value): - if (not isinstance(right_value, float) or - not np.isnan(right_value)): - return False - else: - if left_value != right_value: - return False - return True - - # NaNs can occur in float and complex arrays. - if is_float_dtype(left) or is_complex_dtype(left): - return ((left == right) | (np.isnan(left) & np.isnan(right))).all() - - # numpy will will not allow this type of datetimelike vs integer comparison - elif is_datetimelike_v_numeric(left, right): - return False - - # M8/m8 - elif needs_i8_conversion(left) and needs_i8_conversion(right): - if not is_dtype_equal(left.dtype, right.dtype): - return False - - left = left.view('i8') - right = right.view('i8') - - # NaNs cannot occur otherwise. - return np.array_equal(left, right) - - -def _iterable_not_string(x): - return (isinstance(x, collections.Iterable) and - not isinstance(x, compat.string_types)) - - def flatten(l): """Flatten an arbitrarily nested sequence. @@ -381,510 +125,6 @@ def flatten(l): yield el -def _coerce_indexer_dtype(indexer, categories): - """ coerce the indexer input array to the smallest dtype possible """ - l = len(categories) - if l < _int8_max: - return _ensure_int8(indexer) - elif l < _int16_max: - return _ensure_int16(indexer) - elif l < _int32_max: - return _ensure_int32(indexer) - return _ensure_int64(indexer) - - -def _coerce_to_dtypes(result, dtypes): - """ given a dtypes and a result set, coerce the result elements to the - dtypes - """ - if len(result) != len(dtypes): - raise AssertionError("_coerce_to_dtypes requires equal len arrays") - - from pandas.tseries.timedeltas import _coerce_scalar_to_timedelta_type - - def conv(r, dtype): - try: - if isnull(r): - pass - elif dtype == _NS_DTYPE: - r = lib.Timestamp(r) - elif dtype == _TD_DTYPE: - r = _coerce_scalar_to_timedelta_type(r) - elif dtype == np.bool_: - # messy. non 0/1 integers do not get converted. - if is_integer(r) and r not in [0, 1]: - return int(r) - r = bool(r) - elif dtype.kind == 'f': - r = float(r) - elif dtype.kind == 'i': - r = int(r) - except: - pass - - return r - - return [conv(r, dtype) for r, dtype in zip(result, dtypes)] - - -def _infer_fill_value(val): - """ - infer the fill value for the nan/NaT from the provided - scalar/ndarray/list-like if we are a NaT, return the correct dtyped - element to provide proper block construction - """ - - if not is_list_like(val): - val = [val] - val = np.array(val, copy=False) - if is_datetimelike(val): - return np.array('NaT', dtype=val.dtype) - elif is_object_dtype(val.dtype): - dtype = lib.infer_dtype(_ensure_object(val)) - if dtype in ['datetime', 'datetime64']: - return np.array('NaT', dtype=_NS_DTYPE) - elif dtype in ['timedelta', 'timedelta64']: - return np.array('NaT', dtype=_TD_DTYPE) - return np.nan - - -def _infer_dtype_from_scalar(val): - """ interpret the dtype from a scalar """ - - dtype = np.object_ - - # a 1-element ndarray - if isinstance(val, np.ndarray): - if val.ndim != 0: - raise ValueError( - "invalid ndarray passed to _infer_dtype_from_scalar") - - dtype = val.dtype - val = val.item() - - elif isinstance(val, compat.string_types): - - # If we create an empty array using a string to infer - # the dtype, NumPy will only allocate one character per entry - # so this is kind of bad. 
Alternately we could use np.repeat - # instead of np.empty (but then you still don't want things - # coming out as np.str_! - - dtype = np.object_ - - elif isinstance(val, (np.datetime64, - datetime)) and getattr(val, 'tzinfo', None) is None: - val = lib.Timestamp(val).value - dtype = np.dtype('M8[ns]') - - elif isinstance(val, (np.timedelta64, timedelta)): - val = lib.Timedelta(val).value - dtype = np.dtype('m8[ns]') - - elif is_bool(val): - dtype = np.bool_ - - elif is_integer(val): - if isinstance(val, np.integer): - dtype = type(val) - else: - dtype = np.int64 - - elif is_float(val): - if isinstance(val, np.floating): - dtype = type(val) - else: - dtype = np.float64 - - elif is_complex(val): - dtype = np.complex_ - - return dtype, val - - -def _is_na_compat(arr, fill_value=np.nan): - """ - Parameters - ---------- - arr: a numpy array - fill_value: fill value, default to np.nan - - Returns - ------- - True if we can fill using this fill_value - """ - dtype = arr.dtype - if isnull(fill_value): - return not (is_bool_dtype(dtype) or - is_integer_dtype(dtype)) - return True - - -def _maybe_fill(arr, fill_value=np.nan): - """ - if we have a compatiable fill_value and arr dtype, then fill - """ - if _is_na_compat(arr, fill_value): - arr.fill(fill_value) - return arr - - -def _maybe_promote(dtype, fill_value=np.nan): - - # if we passed an array here, determine the fill value by dtype - if isinstance(fill_value, np.ndarray): - if issubclass(fill_value.dtype.type, (np.datetime64, np.timedelta64)): - fill_value = tslib.iNaT - else: - - # we need to change to object type as our - # fill_value is of object type - if fill_value.dtype == np.object_: - dtype = np.dtype(np.object_) - fill_value = np.nan - - # returns tuple of (dtype, fill_value) - if issubclass(dtype.type, (np.datetime64, np.timedelta64)): - # for now: refuse to upcast datetime64 - # (this is because datetime64 will not implicitly upconvert - # to object correctly as of numpy 1.6.1) - if isnull(fill_value): - fill_value = tslib.iNaT - else: - if issubclass(dtype.type, np.datetime64): - try: - fill_value = lib.Timestamp(fill_value).value - except: - # the proper thing to do here would probably be to upcast - # to object (but numpy 1.6.1 doesn't do this properly) - fill_value = tslib.iNaT - elif issubclass(dtype.type, np.timedelta64): - try: - fill_value = lib.Timedelta(fill_value).value - except: - # as for datetimes, cannot upcast to object - fill_value = tslib.iNaT - else: - fill_value = tslib.iNaT - elif is_datetimetz(dtype): - if isnull(fill_value): - fill_value = tslib.iNaT - elif is_float(fill_value): - if issubclass(dtype.type, np.bool_): - dtype = np.object_ - elif issubclass(dtype.type, np.integer): - dtype = np.float64 - elif is_bool(fill_value): - if not issubclass(dtype.type, np.bool_): - dtype = np.object_ - elif is_integer(fill_value): - if issubclass(dtype.type, np.bool_): - dtype = np.object_ - elif issubclass(dtype.type, np.integer): - # upcast to prevent overflow - arr = np.asarray(fill_value) - if arr != arr.astype(dtype): - dtype = arr.dtype - elif is_complex(fill_value): - if issubclass(dtype.type, np.bool_): - dtype = np.object_ - elif issubclass(dtype.type, (np.integer, np.floating)): - dtype = np.complex128 - elif fill_value is None: - if is_float_dtype(dtype) or is_complex_dtype(dtype): - fill_value = np.nan - elif is_integer_dtype(dtype): - dtype = np.float64 - fill_value = np.nan - elif is_datetime_or_timedelta_dtype(dtype): - fill_value = tslib.iNaT - else: - dtype = np.object_ - else: - dtype = np.object_ - 
- # in case we have a string that looked like a number - if is_categorical_dtype(dtype): - pass - elif is_datetimetz(dtype): - pass - elif issubclass(np.dtype(dtype).type, compat.string_types): - dtype = np.object_ - - return dtype, fill_value - - -def _maybe_upcast_putmask(result, mask, other): - """ - A safe version of putmask that potentially upcasts the result - - Parameters - ---------- - result : ndarray - The destination array. This will be mutated in-place if no upcasting is - necessary. - mask : boolean ndarray - other : ndarray or scalar - The source array or value - - Returns - ------- - result : ndarray - changed : boolean - Set to true if the result array was upcasted - """ - - if mask.any(): - # Two conversions for date-like dtypes that can't be done automatically - # in np.place: - # NaN -> NaT - # integer or integer array -> date-like array - if result.dtype in _DATELIKE_DTYPES: - if lib.isscalar(other): - if isnull(other): - other = result.dtype.type('nat') - elif is_integer(other): - other = np.array(other, dtype=result.dtype) - elif is_integer_dtype(other): - other = np.array(other, dtype=result.dtype) - - def changeit(): - - # try to directly set by expanding our array to full - # length of the boolean - try: - om = other[mask] - om_at = om.astype(result.dtype) - if (om == om_at).all(): - new_result = result.values.copy() - new_result[mask] = om_at - result[:] = new_result - return result, False - except: - pass - - # we are forced to change the dtype of the result as the input - # isn't compatible - r, _ = _maybe_upcast(result, fill_value=other, copy=True) - np.place(r, mask, other) - - return r, True - - # we want to decide whether place will work - # if we have nans in the False portion of our mask then we need to - # upcast (possibly), otherwise we DON't want to upcast (e.g. 
if we - # have values, say integers, in the success portion then it's ok to not - # upcast) - new_dtype, _ = _maybe_promote(result.dtype, other) - if new_dtype != result.dtype: - - # we have a scalar or len 0 ndarray - # and its nan and we are changing some values - if (lib.isscalar(other) or - (isinstance(other, np.ndarray) and other.ndim < 1)): - if isnull(other): - return changeit() - - # we have an ndarray and the masking has nans in it - else: - - if isnull(other[mask]).any(): - return changeit() - - try: - np.place(result, mask, other) - except: - return changeit() - - return result, False - - -def _maybe_upcast(values, fill_value=np.nan, dtype=None, copy=False): - """ provide explict type promotion and coercion - - Parameters - ---------- - values : the ndarray that we want to maybe upcast - fill_value : what we want to fill with - dtype : if None, then use the dtype of the values, else coerce to this type - copy : if True always make a copy even if no upcast is required - """ - - if is_extension_type(values): - if copy: - values = values.copy() - else: - if dtype is None: - dtype = values.dtype - new_dtype, fill_value = _maybe_promote(dtype, fill_value) - if new_dtype != values.dtype: - values = values.astype(new_dtype) - elif copy: - values = values.copy() - - return values, fill_value - - -def _possibly_cast_item(obj, item, dtype): - chunk = obj[item] - - if chunk.values.dtype != dtype: - if dtype in (np.object_, np.bool_): - obj[item] = chunk.astype(np.object_) - elif not issubclass(dtype, (np.integer, np.bool_)): # pragma: no cover - raise ValueError("Unexpected dtype encountered: %s" % dtype) - - -def _possibly_downcast_to_dtype(result, dtype): - """ try to cast to the specified dtype (e.g. convert back to bool/int - or could be an astype of float64->float32 - """ - - if lib.isscalar(result): - return result - - def trans(x): - return x - - if isinstance(dtype, compat.string_types): - if dtype == 'infer': - inferred_type = lib.infer_dtype(_ensure_object(result.ravel())) - if inferred_type == 'boolean': - dtype = 'bool' - elif inferred_type == 'integer': - dtype = 'int64' - elif inferred_type == 'datetime64': - dtype = 'datetime64[ns]' - elif inferred_type == 'timedelta64': - dtype = 'timedelta64[ns]' - - # try to upcast here - elif inferred_type == 'floating': - dtype = 'int64' - if issubclass(result.dtype.type, np.number): - - def trans(x): # noqa - return x.round() - else: - dtype = 'object' - - if isinstance(dtype, compat.string_types): - dtype = np.dtype(dtype) - - try: - - # don't allow upcasts here (except if empty) - if dtype.kind == result.dtype.kind: - if (result.dtype.itemsize <= dtype.itemsize and - np.prod(result.shape)): - return result - - if issubclass(dtype.type, np.floating): - return result.astype(dtype) - elif dtype == np.bool_ or issubclass(dtype.type, np.integer): - - # if we don't have any elements, just astype it - if not np.prod(result.shape): - return trans(result).astype(dtype) - - # do a test on the first element, if it fails then we are done - r = result.ravel() - arr = np.array([r[0]]) - - # if we have any nulls, then we are done - if isnull(arr).any() or not np.allclose(arr, - trans(arr).astype(dtype)): - return result - - # a comparable, e.g. 
a Decimal may slip in here - elif not isinstance(r[0], (np.integer, np.floating, np.bool, int, - float, bool)): - return result - - if (issubclass(result.dtype.type, (np.object_, np.number)) and - notnull(result).all()): - new_result = trans(result).astype(dtype) - try: - if np.allclose(new_result, result): - return new_result - except: - - # comparison of an object dtype with a number type could - # hit here - if (new_result == result).all(): - return new_result - - # a datetimelike - elif dtype.kind in ['M', 'm'] and result.dtype.kind in ['i']: - try: - result = result.astype(dtype) - except: - if dtype.tz: - # convert to datetime and change timezone - result = pd.to_datetime(result).tz_localize(dtype.tz) - - except: - pass - - return result - - -def _maybe_convert_string_to_object(values): - """ - - Convert string-like and string-like array to convert object dtype. - This is to avoid numpy to handle the array as str dtype. - """ - if isinstance(values, string_types): - values = np.array([values], dtype=object) - elif (isinstance(values, np.ndarray) and - issubclass(values.dtype.type, (np.string_, np.unicode_))): - values = values.astype(object) - return values - - -def _maybe_convert_scalar(values): - """ - Convert a python scalar to the appropriate numpy dtype if possible - This avoids numpy directly converting according to platform preferences - """ - if lib.isscalar(values): - dtype, values = _infer_dtype_from_scalar(values) - try: - values = dtype(values) - except TypeError: - pass - return values - - -def _lcd_dtypes(a_dtype, b_dtype): - """ return the lcd dtype to hold these types """ - - if is_datetime64_dtype(a_dtype) or is_datetime64_dtype(b_dtype): - return _NS_DTYPE - elif is_timedelta64_dtype(a_dtype) or is_timedelta64_dtype(b_dtype): - return _TD_DTYPE - elif is_complex_dtype(a_dtype): - if is_complex_dtype(b_dtype): - return a_dtype - return np.float64 - elif is_integer_dtype(a_dtype): - if is_integer_dtype(b_dtype): - if a_dtype.itemsize == b_dtype.itemsize: - return a_dtype - return np.int64 - return np.float64 - elif is_float_dtype(a_dtype): - if is_float_dtype(b_dtype): - if a_dtype.itemsize == b_dtype.itemsize: - return a_dtype - else: - return np.float64 - elif is_integer(b_dtype): - return np.float64 - return np.object - - def _consensus_name_attr(objs): name = objs[0].name for obj in objs[1:]: @@ -892,66 +132,20 @@ def _consensus_name_attr(objs): return None return name -# ---------------------------------------------------------------------- -# Lots of little utilities - - -def _validate_date_like_dtype(dtype): - try: - typ = np.datetime_data(dtype)[0] - except ValueError as e: - raise TypeError('%s' % e) - if typ != 'generic' and typ != 'ns': - raise ValueError('%r is too specific of a frequency, try passing %r' % - (dtype.name, dtype.type.__name__)) - - -def _invalidate_string_dtypes(dtype_set): - """Change string like dtypes to object for - ``DataFrame.select_dtypes()``. - """ - non_string_dtypes = dtype_set - _string_dtypes - if non_string_dtypes != dtype_set: - raise TypeError("string dtypes are not allowed, use 'object' instead") - - -def _get_dtype_from_object(dtype): - """Get a numpy dtype.type-style object. This handles the datetime64[ns] - and datetime64[ns, TZ] compat - - Notes - ----- - If nothing can be found, returns ``object``. 
- """ - # type object from a dtype - if isinstance(dtype, type) and issubclass(dtype, np.generic): - return dtype - elif is_categorical(dtype): - return gt.CategoricalDtype().type - elif is_datetimetz(dtype): - return gt.DatetimeTZDtype(dtype).type - elif isinstance(dtype, np.dtype): # dtype object - try: - _validate_date_like_dtype(dtype) - except TypeError: - # should still pass if we don't have a datelike - pass - return dtype.type - elif isinstance(dtype, compat.string_types): - if dtype == 'datetime' or dtype == 'timedelta': - dtype += '64' - - try: - return _get_dtype_from_object(getattr(np, dtype)) - except (AttributeError, TypeError): - # handles cases like _get_dtype(int) - # i.e., python objects that are valid dtypes (unlike user-defined - # types, in general) - # TypeError handles the float16 typecode of 'e' - # further handle internal types - pass - return _get_dtype_from_object(np.dtype(dtype)) +def _maybe_match_name(a, b): + a_has = hasattr(a, 'name') + b_has = hasattr(b, 'name') + if a_has and b_has: + if a.name == b.name: + return a.name + else: + return None + elif a_has: + return a.name + elif b_has: + return b.name + return None def _get_info_slice(obj, indexer): @@ -988,225 +182,8 @@ def _maybe_box_datetimelike(value): _values_from_object = lib.values_from_object -def _possibly_castable(arr): - # return False to force a non-fastpath - - # check datetime64[ns]/timedelta64[ns] are valid - # otherwise try to coerce - kind = arr.dtype.kind - if kind == 'M' or kind == 'm': - return arr.dtype in _DATELIKE_DTYPES - - return arr.dtype.name not in _POSSIBLY_CAST_DTYPES - - -def _possibly_convert_platform(values): - """ try to do platform conversion, allow ndarray or list here """ - - if isinstance(values, (list, tuple)): - values = lib.list_to_object_array(values) - if getattr(values, 'dtype', None) == np.object_: - if hasattr(values, '_values'): - values = values._values - values = lib.maybe_convert_objects(values) - - return values - - -def _possibly_cast_to_datetime(value, dtype, errors='raise'): - """ try to cast the array/value to a datetimelike dtype, converting float - nan to iNaT - """ - from pandas.tseries.timedeltas import to_timedelta - from pandas.tseries.tools import to_datetime - - if dtype is not None: - if isinstance(dtype, compat.string_types): - dtype = np.dtype(dtype) - - is_datetime64 = is_datetime64_dtype(dtype) - is_datetime64tz = is_datetime64tz_dtype(dtype) - is_timedelta64 = is_timedelta64_dtype(dtype) - - if is_datetime64 or is_datetime64tz or is_timedelta64: - - # force the dtype if needed - if is_datetime64 and not is_dtype_equal(dtype, _NS_DTYPE): - if dtype.name == 'datetime64[ns]': - dtype = _NS_DTYPE - else: - raise TypeError("cannot convert datetimelike to " - "dtype [%s]" % dtype) - elif is_datetime64tz: - - # our NaT doesn't support tz's - # this will coerce to DatetimeIndex with - # a matching dtype below - if lib.isscalar(value) and isnull(value): - value = [value] - - elif is_timedelta64 and not is_dtype_equal(dtype, _TD_DTYPE): - if dtype.name == 'timedelta64[ns]': - dtype = _TD_DTYPE - else: - raise TypeError("cannot convert timedeltalike to " - "dtype [%s]" % dtype) - - if lib.isscalar(value): - if value == tslib.iNaT or isnull(value): - value = tslib.iNaT - else: - value = np.array(value, copy=False) - - # have a scalar array-like (e.g. 
NaT) - if value.ndim == 0: - value = tslib.iNaT - - # we have an array of datetime or timedeltas & nulls - elif np.prod(value.shape) or not is_dtype_equal(value.dtype, - dtype): - try: - if is_datetime64: - value = to_datetime(value, errors=errors)._values - elif is_datetime64tz: - # input has to be UTC at this point, so just - # localize - value = to_datetime( - value, - errors=errors).tz_localize(dtype.tz) - elif is_timedelta64: - value = to_timedelta(value, errors=errors)._values - except (AttributeError, ValueError, TypeError): - pass - - # coerce datetimelike to object - elif is_datetime64_dtype(value) and not is_datetime64_dtype(dtype): - if is_object_dtype(dtype): - ints = np.asarray(value).view('i8') - return tslib.ints_to_pydatetime(ints) - - # we have a non-castable dtype that was passed - raise TypeError('Cannot cast datetime64 to %s' % dtype) - - else: - - is_array = isinstance(value, np.ndarray) - - # catch a datetime/timedelta that is not of ns variety - # and no coercion specified - if is_array and value.dtype.kind in ['M', 'm']: - dtype = value.dtype - - if dtype.kind == 'M' and dtype != _NS_DTYPE: - value = value.astype(_NS_DTYPE) - - elif dtype.kind == 'm' and dtype != _TD_DTYPE: - value = to_timedelta(value) - - # only do this if we have an array and the dtype of the array is not - # setup already we are not an integer/object, so don't bother with this - # conversion - elif not (is_array and not (issubclass(value.dtype.type, np.integer) or - value.dtype == np.object_)): - value = _possibly_infer_to_datetimelike(value) - - return value - - -def _possibly_infer_to_datetimelike(value, convert_dates=False): - """ - we might have an array (or single object) that is datetime like, - and no dtype is passed don't change the value unless we find a - datetime/timedelta set - - this is pretty strict in that a datetime/timedelta is REQUIRED - in addition to possible nulls/string likes - - ONLY strings are NOT datetimelike - - Parameters - ---------- - value : np.array / Series / Index / list-like - convert_dates : boolean, default False - if True try really hard to convert dates (such as datetime.date), other - leave inferred dtype 'date' alone - - """ - - if isinstance(value, (gt.ABCDatetimeIndex, gt.ABCPeriodIndex)): - return value - elif isinstance(value, gt.ABCSeries): - if isinstance(value._values, gt.ABCDatetimeIndex): - return value._values - - v = value - if not is_list_like(v): - v = [v] - v = np.array(v, copy=False) - shape = v.shape - if not v.ndim == 1: - v = v.ravel() - - if len(v): - - def _try_datetime(v): - # safe coerce to datetime64 - try: - v = tslib.array_to_datetime(v, errors='raise') - except ValueError: - - # we might have a sequence of the same-datetimes with tz's - # if so coerce to a DatetimeIndex; if they are not the same, - # then these stay as object dtype - try: - from pandas import to_datetime - return to_datetime(v) - except: - pass - - except: - pass - - return v.reshape(shape) - - def _try_timedelta(v): - # safe coerce to timedelta64 - - # will try first with a string & object conversion - from pandas.tseries.timedeltas import to_timedelta - try: - return to_timedelta(v)._values.reshape(shape) - except: - return v - - # do a quick inference for perf - sample = v[:min(3, len(v))] - inferred_type = lib.infer_dtype(sample) - - if (inferred_type in ['datetime', 'datetime64'] or - (convert_dates and inferred_type in ['date'])): - value = _try_datetime(v) - elif inferred_type in ['timedelta', 'timedelta64']: - value = _try_timedelta(v) - - # It's possible 
to have nulls intermixed within the datetime or - # timedelta. These will in general have an inferred_type of 'mixed', - # so have to try both datetime and timedelta. - - # try timedelta first to avoid spurious datetime conversions - # e.g. '00:00:01' is a timedelta but technically is also a datetime - elif inferred_type in ['mixed']: - - if lib.is_possible_datetimelike_array(_ensure_object(v)): - value = _try_timedelta(v) - if lib.infer_dtype(value) in ['mixed']: - value = _try_datetime(v) - - return value - - def is_bool_indexer(key): - if isinstance(key, (gt.ABCSeries, np.ndarray)): + if isinstance(key, (ABCSeries, np.ndarray)): if key.dtype == np.object_: key = np.asarray(_values_from_object(key)) @@ -1233,12 +210,6 @@ def _default_index(n): return RangeIndex(0, n, name=None) -def ensure_float(arr): - if issubclass(arr.dtype.type, (np.integer, np.bool_)): - arr = arr.astype(float) - return arr - - def _mut_exclusive(**kwargs): item1, item2 = kwargs.items() label1, val1 = item1 @@ -1270,6 +241,10 @@ def _all_not_none(*args): return True +def _count_not_none(*args): + return sum(x is not None for x in args) + + def _try_sort(iterable): listed = list(iterable) try: @@ -1278,10 +253,6 @@ def _try_sort(iterable): return listed -def _count_not_none(*args): - return sum(x is not None for x in args) - - def iterpairs(seq): """ Parameters @@ -1434,349 +405,6 @@ def _maybe_make_list(obj): return [obj] return obj -# TYPE TESTING - -is_bool = lib.is_bool - -is_integer = lib.is_integer - -is_float = lib.is_float - -is_complex = lib.is_complex - - -def is_string_like(obj): - return isinstance(obj, (compat.text_type, compat.string_types)) - - -def is_iterator(obj): - # python 3 generators have __next__ instead of next - return hasattr(obj, 'next') or hasattr(obj, '__next__') - - -def is_number(obj): - return isinstance(obj, (numbers.Number, np.number)) - - -def is_period_arraylike(arr): - """ return if we are period arraylike / PeriodIndex """ - if isinstance(arr, pd.PeriodIndex): - return True - elif isinstance(arr, (np.ndarray, gt.ABCSeries)): - return arr.dtype == object and lib.infer_dtype(arr) == 'period' - return getattr(arr, 'inferred_type', None) == 'period' - - -def is_datetime_arraylike(arr): - """ return if we are datetime arraylike / DatetimeIndex """ - if isinstance(arr, gt.ABCDatetimeIndex): - return True - elif isinstance(arr, (np.ndarray, gt.ABCSeries)): - return arr.dtype == object and lib.infer_dtype(arr) == 'datetime' - return getattr(arr, 'inferred_type', None) == 'datetime' - - -def is_datetimelike(arr): - return (arr.dtype in _DATELIKE_DTYPES or - isinstance(arr, gt.ABCPeriodIndex) or - is_datetimetz(arr)) - - -def _coerce_to_dtype(dtype): - """ coerce a string / np.dtype to a dtype """ - if is_categorical_dtype(dtype): - dtype = gt.CategoricalDtype() - elif is_datetime64tz_dtype(dtype): - dtype = gt.DatetimeTZDtype(dtype) - else: - dtype = np.dtype(dtype) - return dtype - - -def _get_dtype(arr_or_dtype): - if isinstance(arr_or_dtype, np.dtype): - return arr_or_dtype - elif isinstance(arr_or_dtype, type): - return np.dtype(arr_or_dtype) - elif isinstance(arr_or_dtype, gt.CategoricalDtype): - return arr_or_dtype - elif isinstance(arr_or_dtype, gt.DatetimeTZDtype): - return arr_or_dtype - elif isinstance(arr_or_dtype, compat.string_types): - if is_categorical_dtype(arr_or_dtype): - return gt.CategoricalDtype.construct_from_string(arr_or_dtype) - elif is_datetime64tz_dtype(arr_or_dtype): - return gt.DatetimeTZDtype.construct_from_string(arr_or_dtype) - - if hasattr(arr_or_dtype, 
'dtype'): - arr_or_dtype = arr_or_dtype.dtype - return np.dtype(arr_or_dtype) - - -def _get_dtype_type(arr_or_dtype): - if isinstance(arr_or_dtype, np.dtype): - return arr_or_dtype.type - elif isinstance(arr_or_dtype, type): - return np.dtype(arr_or_dtype).type - elif isinstance(arr_or_dtype, gt.CategoricalDtype): - return gt.CategoricalDtypeType - elif isinstance(arr_or_dtype, gt.DatetimeTZDtype): - return gt.DatetimeTZDtypeType - elif isinstance(arr_or_dtype, compat.string_types): - if is_categorical_dtype(arr_or_dtype): - return gt.CategoricalDtypeType - elif is_datetime64tz_dtype(arr_or_dtype): - return gt.DatetimeTZDtypeType - return _get_dtype_type(np.dtype(arr_or_dtype)) - try: - return arr_or_dtype.dtype.type - except AttributeError: - return type(None) - - -def is_dtype_equal(source, target): - """ return a boolean if the dtypes are equal """ - try: - source = _get_dtype(source) - target = _get_dtype(target) - return source == target - except (TypeError, AttributeError): - - # invalid comparison - # object == category will hit this - return False - - -def is_any_int_dtype(arr_or_dtype): - tipo = _get_dtype_type(arr_or_dtype) - return issubclass(tipo, np.integer) - - -def is_integer_dtype(arr_or_dtype): - tipo = _get_dtype_type(arr_or_dtype) - return (issubclass(tipo, np.integer) and - not issubclass(tipo, (np.datetime64, np.timedelta64))) - - -def is_int64_dtype(arr_or_dtype): - tipo = _get_dtype_type(arr_or_dtype) - return issubclass(tipo, np.int64) - - -def is_int_or_datetime_dtype(arr_or_dtype): - tipo = _get_dtype_type(arr_or_dtype) - return (issubclass(tipo, np.integer) or - issubclass(tipo, (np.datetime64, np.timedelta64))) - - -def is_datetime64_dtype(arr_or_dtype): - try: - tipo = _get_dtype_type(arr_or_dtype) - except TypeError: - return False - return issubclass(tipo, np.datetime64) - - -def is_datetime64tz_dtype(arr_or_dtype): - return gt.DatetimeTZDtype.is_dtype(arr_or_dtype) - - -def is_datetime64_any_dtype(arr_or_dtype): - return (is_datetime64_dtype(arr_or_dtype) or - is_datetime64tz_dtype(arr_or_dtype)) - - -def is_datetime64_ns_dtype(arr_or_dtype): - try: - tipo = _get_dtype(arr_or_dtype) - except TypeError: - return False - return tipo == _NS_DTYPE - - -def is_timedelta64_dtype(arr_or_dtype): - tipo = _get_dtype_type(arr_or_dtype) - return issubclass(tipo, np.timedelta64) - - -def is_timedelta64_ns_dtype(arr_or_dtype): - tipo = _get_dtype(arr_or_dtype) - return tipo == _TD_DTYPE - - -def is_datetime_or_timedelta_dtype(arr_or_dtype): - tipo = _get_dtype_type(arr_or_dtype) - return issubclass(tipo, (np.datetime64, np.timedelta64)) - - -def is_numeric_v_string_like(a, b): - """ - numpy doesn't like to compare numeric arrays vs scalar string-likes - - return a boolean result if this is the case for a,b or b,a - - """ - is_a_array = isinstance(a, np.ndarray) - is_b_array = isinstance(b, np.ndarray) - - is_a_numeric_array = is_a_array and is_numeric_dtype(a) - is_b_numeric_array = is_b_array and is_numeric_dtype(b) - is_a_string_array = is_a_array and is_string_like_dtype(a) - is_b_string_array = is_b_array and is_string_like_dtype(b) - - is_a_scalar_string_like = not is_a_array and is_string_like(a) - is_b_scalar_string_like = not is_b_array and is_string_like(b) - - return ((is_a_numeric_array and is_b_scalar_string_like) or - (is_b_numeric_array and is_a_scalar_string_like) or - (is_a_numeric_array and is_b_string_array) or - (is_b_numeric_array and is_a_string_array)) - - -def is_datetimelike_v_numeric(a, b): - # return if we have an i8 convertible and numeric 
comparison - if not hasattr(a, 'dtype'): - a = np.asarray(a) - if not hasattr(b, 'dtype'): - b = np.asarray(b) - - def is_numeric(x): - return is_integer_dtype(x) or is_float_dtype(x) - - is_datetimelike = needs_i8_conversion - return ((is_datetimelike(a) and is_numeric(b)) or - (is_datetimelike(b) and is_numeric(a))) - - -def is_datetimelike_v_object(a, b): - # return if we have an i8 convertible and object comparsion - if not hasattr(a, 'dtype'): - a = np.asarray(a) - if not hasattr(b, 'dtype'): - b = np.asarray(b) - - def f(x): - return is_object_dtype(x) - - def is_object(x): - return is_integer_dtype(x) or is_float_dtype(x) - - is_datetimelike = needs_i8_conversion - return ((is_datetimelike(a) and is_object(b)) or - (is_datetimelike(b) and is_object(a))) - - -def needs_i8_conversion(arr_or_dtype): - return (is_datetime_or_timedelta_dtype(arr_or_dtype) or - is_datetime64tz_dtype(arr_or_dtype)) - - -def is_numeric_dtype(arr_or_dtype): - tipo = _get_dtype_type(arr_or_dtype) - return (issubclass(tipo, (np.number, np.bool_)) and - not issubclass(tipo, (np.datetime64, np.timedelta64))) - - -def is_string_dtype(arr_or_dtype): - dtype = _get_dtype(arr_or_dtype) - return dtype.kind in ('O', 'S', 'U') - - -def is_string_like_dtype(arr_or_dtype): - # exclude object as its a mixed dtype - dtype = _get_dtype(arr_or_dtype) - return dtype.kind in ('S', 'U') - - -def is_float_dtype(arr_or_dtype): - tipo = _get_dtype_type(arr_or_dtype) - return issubclass(tipo, np.floating) - - -def is_floating_dtype(arr_or_dtype): - tipo = _get_dtype_type(arr_or_dtype) - return isinstance(tipo, np.floating) - - -def is_bool_dtype(arr_or_dtype): - try: - tipo = _get_dtype_type(arr_or_dtype) - except ValueError: - # this isn't even a dtype - return False - return issubclass(tipo, np.bool_) - - -def is_sparse(array): - """ return if we are a sparse array """ - return isinstance(array, (gt.ABCSparseArray, gt.ABCSparseSeries)) - - -def is_datetimetz(array): - """ return if we are a datetime with tz array """ - return ((isinstance(array, gt.ABCDatetimeIndex) and - getattr(array, 'tz', None) is not None) or - is_datetime64tz_dtype(array)) - - -def is_extension_type(value): - """ - if we are a klass that is preserved by the internals - these are internal klasses that we represent (and don't use a np.array) - """ - if is_categorical(value): - return True - elif is_sparse(value): - return True - elif is_datetimetz(value): - return True - return False - - -def is_categorical(array): - """ return if we are a categorical possibility """ - return isinstance(array, gt.ABCCategorical) or is_categorical_dtype(array) - - -def is_categorical_dtype(arr_or_dtype): - return gt.CategoricalDtype.is_dtype(arr_or_dtype) - - -def is_complex_dtype(arr_or_dtype): - tipo = _get_dtype_type(arr_or_dtype) - return issubclass(tipo, np.complexfloating) - - -def is_object_dtype(arr_or_dtype): - tipo = _get_dtype_type(arr_or_dtype) - return issubclass(tipo, np.object_) - - -def is_re(obj): - return isinstance(obj, re._pattern_type) - - -def is_re_compilable(obj): - try: - re.compile(obj) - except TypeError: - return False - else: - return True - - -def is_list_like(arg): - return (hasattr(arg, '__iter__') and - not isinstance(arg, compat.string_and_binary_types)) - - -def is_dict_like(arg): - return hasattr(arg, '__getitem__') and hasattr(arg, 'keys') - - -def is_named_tuple(arg): - return isinstance(arg, tuple) and hasattr(arg, '_fields') - def is_null_slice(obj): """ we have a null slice """ @@ -1790,47 +418,6 @@ def is_full_slice(obj, l): obj.step 
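# Illustrative sketch, not part of the patch: the introspection helpers
# removed above (is_list_like, is_bool_dtype, is_categorical_dtype, ...) are
# relocated to the pandas.types.* modules and surfaced publicly through
# pandas.api.types rather than deleted outright. Assuming those names are
# exposed there in this build:
import pandas as pd
from pandas.api import types

print(types.is_integer_dtype(pd.Series([1, 2, 3]).dtype))       # True
print(types.is_categorical_dtype(pd.Categorical(['a', 'b'])))   # True
print(types.is_bool_dtype(pd.Series([True, False])))            # True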
is None) -def is_hashable(arg): - """Return True if hash(arg) will succeed, False otherwise. - - Some types will pass a test against collections.Hashable but fail when they - are actually hashed with hash(). - - Distinguish between these and other types by trying the call to hash() and - seeing if they raise TypeError. - - Examples - -------- - >>> a = ([],) - >>> isinstance(a, collections.Hashable) - True - >>> is_hashable(a) - False - """ - # unfortunately, we can't use isinstance(arg, collections.Hashable), which - # can be faster than calling hash, because numpy scalars on Python 3 fail - # this test - - # reconsider this decision once this numpy bug is fixed: - # https://github.com/numpy/numpy/issues/5562 - - try: - hash(arg) - except TypeError: - return False - else: - return True - - -def is_sequence(x): - try: - iter(x) - len(x) # it has a length - return not isinstance(x, compat.string_and_binary_types) - except (TypeError, AttributeError): - return False - - def _get_callable_name(obj): # typical case has name if hasattr(obj, '__name__'): @@ -1858,74 +445,6 @@ def _apply_if_callable(maybe_callable, obj, **kwargs): return maybe_callable -_string_dtypes = frozenset(map(_get_dtype_from_object, (compat.binary_type, - compat.text_type))) - -_ensure_float64 = algos.ensure_float64 -_ensure_float32 = algos.ensure_float32 -_ensure_int64 = algos.ensure_int64 -_ensure_int32 = algos.ensure_int32 -_ensure_int16 = algos.ensure_int16 -_ensure_int8 = algos.ensure_int8 -_ensure_platform_int = algos.ensure_platform_int -_ensure_object = algos.ensure_object - - -def _astype_nansafe(arr, dtype, copy=True): - """ return a view if copy is False, but - need to be very careful as the result shape could change! """ - if not isinstance(dtype, np.dtype): - dtype = _coerce_to_dtype(dtype) - - if issubclass(dtype.type, compat.text_type): - # in Py3 that's str, in Py2 that's unicode - return lib.astype_unicode(arr.ravel()).reshape(arr.shape) - elif issubclass(dtype.type, compat.string_types): - return lib.astype_str(arr.ravel()).reshape(arr.shape) - elif is_datetime64_dtype(arr): - if dtype == object: - return tslib.ints_to_pydatetime(arr.view(np.int64)) - elif dtype == np.int64: - return arr.view(dtype) - elif dtype != _NS_DTYPE: - raise TypeError("cannot astype a datetimelike from [%s] to [%s]" % - (arr.dtype, dtype)) - return arr.astype(_NS_DTYPE) - elif is_timedelta64_dtype(arr): - if dtype == np.int64: - return arr.view(dtype) - elif dtype == object: - return tslib.ints_to_pytimedelta(arr.view(np.int64)) - - # in py3, timedelta64[ns] are int64 - elif ((compat.PY3 and dtype not in [_INT64_DTYPE, _TD_DTYPE]) or - (not compat.PY3 and dtype != _TD_DTYPE)): - - # allow frequency conversions - if dtype.kind == 'm': - mask = isnull(arr) - result = arr.astype(dtype).astype(np.float64) - result[mask] = np.nan - return result - - raise TypeError("cannot astype a timedelta from [%s] to [%s]" % - (arr.dtype, dtype)) - - return arr.astype(_TD_DTYPE) - elif (np.issubdtype(arr.dtype, np.floating) and - np.issubdtype(dtype, np.integer)): - - if np.isnan(arr).any(): - raise ValueError('Cannot convert NA to integer') - elif arr.dtype == np.object_ and np.issubdtype(dtype.type, np.integer): - # work around NumPy brokenness, #1987 - return lib.astype_intsafe(arr.ravel(), dtype).reshape(arr.shape) - - if copy: - return arr.astype(dtype) - return arr.view(dtype) - - def _all_none(*args): for arg in args: if arg is not None: @@ -1971,6 +490,9 @@ class Sentinel(object): return Sentinel() +# 
---------------------------------------------------------------------- +# Detect our environment + def in_interactive_session(): """ check if we're running in an interactive shell @@ -2038,21 +560,6 @@ def in_ipython_frontend(): return False -def _maybe_match_name(a, b): - a_has = hasattr(a, 'name') - b_has = hasattr(b, 'name') - if a_has and b_has: - if a.name == b.name: - return a.name - else: - return None - elif a_has: - return a.name - elif b_has: - return b.name - return None - - def _random_state(state=None): """ Helper function for processing random_state arguments. @@ -2071,7 +578,7 @@ def _random_state(state=None): np.random.RandomState """ - if is_integer(state): + if types.is_integer(state): return np.random.RandomState(state) elif isinstance(state, np.random.RandomState): return state diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 3ca2c6cd014bc..fe47391c9ff81 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -346,6 +346,17 @@ def mpl_style_cb(key): cf.deprecate_option('display.height', msg=pc_height_deprecation_warning, rkey='display.max_rows') +pc_html_border_doc = """ +: int + A ``border=value`` attribute is inserted in the ``
`` tag + for the DataFrame HTML repr. +""" + +with cf.config_prefix('html'): + cf.register_option('border', 1, pc_html_border_doc, + validator=is_int) + + tc_sim_interactive_doc = """ : boolean Whether to simulate interactive mode for purposes of testing @@ -366,7 +377,7 @@ def mpl_style_cb(key): def use_inf_as_null_cb(key): - from pandas.core.common import _use_inf_as_null + from pandas.types.missing import _use_inf_as_null _use_inf_as_null(key) with cf.config_prefix('mode'): diff --git a/pandas/core/convert.py b/pandas/core/convert.py deleted file mode 100644 index 7f4fe73c688f8..0000000000000 --- a/pandas/core/convert.py +++ /dev/null @@ -1,127 +0,0 @@ -""" -Functions for converting object to other types -""" - -import numpy as np - -import pandas as pd -from pandas.core.common import (_possibly_cast_to_datetime, is_object_dtype, - isnull) -import pandas.lib as lib - - -# TODO: Remove in 0.18 or 2017, which ever is sooner -def _possibly_convert_objects(values, convert_dates=True, convert_numeric=True, - convert_timedeltas=True, copy=True): - """ if we have an object dtype, try to coerce dates and/or numbers """ - - # if we have passed in a list or scalar - if isinstance(values, (list, tuple)): - values = np.array(values, dtype=np.object_) - if not hasattr(values, 'dtype'): - values = np.array([values], dtype=np.object_) - - # convert dates - if convert_dates and values.dtype == np.object_: - - # we take an aggressive stance and convert to datetime64[ns] - if convert_dates == 'coerce': - new_values = _possibly_cast_to_datetime(values, 'M8[ns]', - errors='coerce') - - # if we are all nans then leave me alone - if not isnull(new_values).all(): - values = new_values - - else: - values = lib.maybe_convert_objects(values, - convert_datetime=convert_dates) - - # convert timedeltas - if convert_timedeltas and values.dtype == np.object_: - - if convert_timedeltas == 'coerce': - from pandas.tseries.timedeltas import to_timedelta - new_values = to_timedelta(values, coerce=True) - - # if we are all nans then leave me alone - if not isnull(new_values).all(): - values = new_values - - else: - values = lib.maybe_convert_objects( - values, convert_timedelta=convert_timedeltas) - - # convert to numeric - if values.dtype == np.object_: - if convert_numeric: - try: - new_values = lib.maybe_convert_numeric(values, set(), - coerce_numeric=True) - - # if we are all nans then leave me alone - if not isnull(new_values).all(): - values = new_values - - except: - pass - else: - # soft-conversion - values = lib.maybe_convert_objects(values) - - values = values.copy() if copy else values - - return values - - -def _soft_convert_objects(values, datetime=True, numeric=True, timedelta=True, - coerce=False, copy=True): - """ if we have an object dtype, try to coerce dates and/or numbers """ - - conversion_count = sum((datetime, numeric, timedelta)) - if conversion_count == 0: - raise ValueError('At least one of datetime, numeric or timedelta must ' - 'be True.') - elif conversion_count > 1 and coerce: - raise ValueError("Only one of 'datetime', 'numeric' or " - "'timedelta' can be True when when coerce=True.") - - if isinstance(values, (list, tuple)): - # List or scalar - values = np.array(values, dtype=np.object_) - elif not hasattr(values, 'dtype'): - values = np.array([values], dtype=np.object_) - elif not is_object_dtype(values.dtype): - # If not object, do not attempt conversion - values = values.copy() if copy else values - return values - - # If 1 flag is coerce, ensure 2 others are False - if coerce: - # 
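# Illustrative sketch, not part of the patch: the config_init.py hunk above
# registers a new ``html.border`` option (default 1) that controls the border
# attribute written into the DataFrame HTML repr. Assuming a 0.19-era build
# where the option is registered under the ``html`` prefix:
import pandas as pd

print(pd.get_option('html.border'))    # 1
pd.set_option('html.border', 2)
html = pd.DataFrame({'a': [1, 2]}).to_html()
# on such a build the <table> tag inside `html` now carries border="2"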
Immediate return if coerce - if datetime: - return pd.to_datetime(values, errors='coerce', box=False) - elif timedelta: - return pd.to_timedelta(values, errors='coerce', box=False) - elif numeric: - return pd.to_numeric(values, errors='coerce') - - # Soft conversions - if datetime: - values = lib.maybe_convert_objects(values, convert_datetime=datetime) - - if timedelta and is_object_dtype(values.dtype): - # Object check to ensure only run if previous did not convert - values = lib.maybe_convert_objects(values, convert_timedelta=timedelta) - - if numeric and is_object_dtype(values.dtype): - try: - converted = lib.maybe_convert_numeric(values, set(), - coerce_numeric=True) - # If all NaNs, then do not-alter - values = converted if not isnull(converted).all() else values - values = values.copy() if copy else values - except: - pass - - return values diff --git a/pandas/core/datetools.py b/pandas/core/datetools.py index 79718c79f9bdd..bfc3f3d4e4743 100644 --- a/pandas/core/datetools.py +++ b/pandas/core/datetools.py @@ -2,10 +2,16 @@ # flake8: noqa +import warnings + from pandas.tseries.tools import * from pandas.tseries.offsets import * from pandas.tseries.frequencies import * +warnings.warn("The pandas.core.datetools module is deprecated and will be " + "removed in a future version. Please use the pandas.tseries " + "module instead.", FutureWarning, stacklevel=2) + day = DateOffset() bday = BDay() businessDay = bday diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 69def7502a6f7..ac3e5d2aabef7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -23,12 +23,45 @@ import numpy as np import numpy.ma as ma -from pandas.core.common import ( - isnull, notnull, PandasError, _try_sort, _default_index, _maybe_upcast, - is_sequence, _infer_dtype_from_scalar, _values_from_object, is_list_like, - _maybe_box_datetimelike, is_categorical_dtype, is_object_dtype, - is_extension_type, is_datetimetz, _possibly_infer_to_datetimelike, - _dict_compat) +from pandas.types.cast import (_maybe_upcast, + _infer_dtype_from_scalar, + _possibly_cast_to_datetime, + _possibly_infer_to_datetimelike, + _possibly_convert_platform, + _possibly_downcast_to_dtype, + _invalidate_string_dtypes, + _coerce_to_dtypes, + _maybe_upcast_putmask, + _find_common_type) +from pandas.types.common import (is_categorical_dtype, + is_object_dtype, + is_extension_type, + is_datetimetz, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_bool_dtype, + is_integer_dtype, + is_float_dtype, + is_integer, + is_scalar, + is_dtype_equal, + needs_i8_conversion, + _get_dtype_from_object, + _ensure_float, + _ensure_float64, + _ensure_int64, + _ensure_platform_int, + is_list_like, + is_iterator, + is_sequence, + is_named_tuple) +from pandas.types.missing import isnull, notnull + +from pandas.core.common import (PandasError, _try_sort, + _default_index, + _values_from_object, + _maybe_box_datetimelike, + _dict_compat) from pandas.core.generic import NDFrame, _shared_docs from pandas.core.index import Index, MultiIndex, _ensure_index from pandas.core.indexing import (maybe_droplevels, convert_to_index_sliceable, @@ -45,8 +78,7 @@ OrderedDict, raise_with_traceback) from pandas import compat from pandas.compat.numpy import function as nv -from pandas.util.decorators import (deprecate, Appender, Substitution, - deprecate_kwarg) +from pandas.util.decorators import deprecate_kwarg, Appender, Substitution from pandas.tseries.period import PeriodIndex from pandas.tseries.index import DatetimeIndex @@ -68,8 +100,12 @@ # 
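# Illustrative sketch, not part of the patch: pandas/core/datetools.py above
# now emits a FutureWarning on import; the supported spellings live in the
# pandas.tseries namespace. For example:
#   import pandas.core.datetools     # warns: module is deprecated
from pandas.tseries.offsets import DateOffset, BDay   # preferred imports
print(DateOffset(days=2))
print(BDay())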
--------------------------------------------------------------------- # Docstring templates -_shared_doc_kwargs = dict(axes='index, columns', klass='DataFrame', - axes_single_arg="{0, 1, 'index', 'columns'}") +_shared_doc_kwargs = dict( + axes='index, columns', klass='DataFrame', + axes_single_arg="{0 or 'index', 1 or 'columns'}", + optional_by=""" + by : str or list of str + Name or list of names which refer to the axis items.""") _numeric_only_doc = """numeric_only : boolean, default None Include only float, int, boolean data. If None, will attempt to use @@ -153,6 +189,12 @@ merged : DataFrame The output type will the be same as 'left', if it is a subclass of DataFrame. + +See also +-------- +merge_ordered +merge_asof + """ # ----------------------------------------------------------------------- @@ -258,7 +300,7 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, data = list(data) if len(data) > 0: if is_list_like(data[0]) and getattr(data[0], 'ndim', 1) == 1: - if com.is_named_tuple(data[0]) and columns is None: + if is_named_tuple(data[0]) and columns is None: columns = data[0]._fields arrays, columns = _to_arrays(data, columns, dtype=dtype) columns = _ensure_index(columns) @@ -652,8 +694,9 @@ def iterrows(self): """ columns = self.columns + klass = self._constructor_sliced for k, v in zip(self.index, self.values): - s = Series(v, index=columns, name=k) + s = klass(v, index=columns, name=k) yield k, s def itertuples(self, index=True, name="Pandas"): @@ -808,7 +851,6 @@ def from_dict(cls, data, orient='columns', dtype=None): return cls(data, index=index, columns=columns, dtype=dtype) - @deprecate_kwarg(old_arg_name='outtype', new_arg_name='orient') def to_dict(self, orient='dict'): """Convert DataFrame to dictionary. @@ -931,7 +973,7 @@ def from_records(cls, data, index=None, exclude=None, columns=None, if columns is not None: columns = _ensure_index(columns) - if com.is_iterator(data): + if is_iterator(data): if nrows == 0: return cls() @@ -1042,7 +1084,7 @@ def to_records(self, index=True, convert_datetime64=True): y : recarray """ if index: - if com.is_datetime64_dtype(self.index) and convert_datetime64: + if is_datetime64_dtype(self.index) and convert_datetime64: ix_vals = [self.index.to_pydatetime()] else: if isinstance(self.index, MultiIndex): @@ -1258,15 +1300,13 @@ def to_panel(self): return self._constructor_expanddim(new_mgr) - to_wide = deprecate('to_wide', to_panel) - def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, columns=None, header=True, index=True, index_label=None, mode='w', encoding=None, compression=None, quoting=None, quotechar='"', line_terminator='\n', chunksize=None, tupleize_cols=False, date_format=None, doublequote=True, - escapechar=None, decimal='.', **kwds): - """Write DataFrame to a comma-separated values (csv) file + escapechar=None, decimal='.'): + r"""Write DataFrame to a comma-separated values (csv) file Parameters ---------- @@ -1292,8 +1332,6 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, sequence should be given if the DataFrame uses MultiIndex. If False do not print fields for index names. 
Use index_label=False for easier importing in R - nanRep : None - deprecated, use na_rep mode : str Python write mode, default 'w' encoding : string, optional @@ -1303,7 +1341,7 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, a string representing the compression to use in the output file, allowed values are 'gzip', 'bz2', 'xz', only used when the first argument is a filename - line_terminator : string, default '\\n' + line_terminator : string, default ``'\n'`` The newline character or character sequence to use in the output file quoting : optional constant from csv module @@ -1336,7 +1374,6 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, cols=columns, header=header, index=index, index_label=index_label, mode=mode, chunksize=chunksize, quotechar=quotechar, - engine=kwds.get("engine"), tupleize_cols=tupleize_cols, date_format=date_format, doublequote=doublequote, @@ -1428,24 +1465,50 @@ def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='', def to_stata(self, fname, convert_dates=None, write_index=True, encoding="latin-1", byteorder=None, time_stamp=None, - data_label=None): + data_label=None, variable_labels=None): """ A class for writing Stata binary dta files from array-like objects Parameters ---------- - fname : file path or buffer - Where to save the dta file. + fname : str or buffer + String path or file-like object convert_dates : dict - Dictionary mapping column of datetime types to the stata internal - format that you want to use for the dates. Options are - 'tc', 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either a - number or a name. + Dictionary mapping columns containing datetime types to stata + internal format to use when writing the dates. Options are 'tc', + 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either an integer + or a name. Datetime columns that do not have a conversion type + specified will be converted to 'tc'. Raises NotImplementedError if + a datetime column has timezone information + write_index : bool + Write the index to Stata dataset. encoding : str - Default is latin-1. Note that Stata does not support unicode. + Default is latin-1. Unicode is not supported byteorder : str - Can be ">", "<", "little", or "big". The default is None which uses - `sys.byteorder` + Can be ">", "<", "little", or "big". default is `sys.byteorder` + time_stamp : datetime + A datetime to use as file creation date. Default is the current + time. + data_label : str + A label for the data set. Must be 80 characters or smaller. + variable_labels : dict + Dictionary containing columns as keys and variable labels as + values. Each label must be 80 characters or smaller. + + .. versionadded:: 0.19.0 + + Raises + ------ + NotImplementedError + * If datetimes contain timezone information + * Column dtype is not representable in Stata + ValueError + * Columns listed in convert_dates are neither datetime64[ns] + nor datetime.datetime + * Column listed in convert_dates is not in DataFrame + * Categorical label contains more than 32,000 characters + + ..
versionadded:: 0.19.0 Examples -------- @@ -1461,7 +1524,8 @@ def to_stata(self, fname, convert_dates=None, write_index=True, writer = StataWriter(fname, self, convert_dates=convert_dates, encoding=encoding, byteorder=byteorder, time_stamp=time_stamp, data_label=data_label, - write_index=write_index) + write_index=write_index, + variable_labels=variable_labels) writer.write_file() @Appender(fmt.docstring_to_string, indents=1) @@ -1492,12 +1556,12 @@ def to_string(self, buf=None, columns=None, col_space=None, header=True, return result @Appender(fmt.docstring_to_string, indents=1) - def to_html(self, buf=None, columns=None, col_space=None, colSpace=None, - header=True, index=True, na_rep='NaN', formatters=None, - float_format=None, sparsify=None, index_names=True, - justify=None, bold_rows=True, classes=None, escape=True, - max_rows=None, max_cols=None, show_dimensions=False, - notebook=False, decimal='.'): + def to_html(self, buf=None, columns=None, col_space=None, header=True, + index=True, na_rep='NaN', formatters=None, float_format=None, + sparsify=None, index_names=True, justify=None, bold_rows=True, + classes=None, escape=True, max_rows=None, max_cols=None, + show_dimensions=False, notebook=False, decimal='.', + border=None): """ Render a DataFrame as an HTML table. @@ -1519,12 +1583,12 @@ def to_html(self, buf=None, columns=None, col_space=None, colSpace=None, Character recognized as decimal separator, e.g. ',' in Europe .. versionadded:: 0.18.0 - """ + border : int + A ``border=border`` attribute is included in the opening + `
` tag. Default ``pd.options.html.border``. - if colSpace is not None: # pragma: no cover - warnings.warn("colSpace is deprecated, use col_space", - FutureWarning, stacklevel=2) - col_space = colSpace + .. versionadded:: 0.19.0 + """ formatter = fmt.DataFrameFormatter(self, buf=buf, columns=columns, col_space=col_space, na_rep=na_rep, @@ -1539,17 +1603,17 @@ def to_html(self, buf=None, columns=None, col_space=None, colSpace=None, show_dimensions=show_dimensions, decimal=decimal) # TODO: a generic formatter wld b in DataFrameFormatter - formatter.to_html(classes=classes, notebook=notebook) + formatter.to_html(classes=classes, notebook=notebook, border=border) if buf is None: return formatter.buf.getvalue() @Appender(fmt.common_docstring + fmt.return_docstring, indents=1) - def to_latex(self, buf=None, columns=None, col_space=None, colSpace=None, - header=True, index=True, na_rep='NaN', formatters=None, - float_format=None, sparsify=None, index_names=True, - bold_rows=True, column_format=None, longtable=None, - escape=None, encoding=None, decimal='.'): + def to_latex(self, buf=None, columns=None, col_space=None, header=True, + index=True, na_rep='NaN', formatters=None, float_format=None, + sparsify=None, index_names=True, bold_rows=True, + column_format=None, longtable=None, escape=None, + encoding=None, decimal='.'): """ Render a DataFrame to a tabular environment table. You can splice this into a LaTeX document. Requires \\usepackage{booktabs}. @@ -1578,11 +1642,6 @@ def to_latex(self, buf=None, columns=None, col_space=None, colSpace=None, .. versionadded:: 0.18.0 """ - - if colSpace is not None: # pragma: no cover - warnings.warn("colSpace is deprecated, use col_space", - FutureWarning, stacklevel=2) - col_space = colSpace # Get defaults from the pandas config if longtable is None: longtable = get_option("display.latex.longtable") @@ -1912,7 +1971,7 @@ def _ixs(self, i, axis=0): copy = True else: new_values = self._data.fast_xs(i) - if lib.isscalar(new_values): + if is_scalar(new_values): return new_values # if we are a copy, mark as such @@ -2064,7 +2123,7 @@ def _getitem_multilevel(self, key): return self._get_item_cache(key) def _getitem_frame(self, key): - if key.values.size and not com.is_bool_dtype(key.values): + if key.values.size and not is_bool_dtype(key.values): raise ValueError('Must pass DataFrame with boolean values only') return self.where(key) @@ -2213,7 +2272,7 @@ def eval(self, expr, inplace=None, **kwargs): resolvers = dict(self.iteritems()), index_resolvers if 'target' not in kwargs: kwargs['target'] = self - kwargs['resolvers'] = kwargs.get('resolvers', ()) + resolvers + kwargs['resolvers'] = kwargs.get('resolvers', ()) + tuple(resolvers) return _eval(expr, inplace=inplace, **kwargs) def select_dtypes(self, include=None, exclude=None): @@ -2281,7 +2340,7 @@ def select_dtypes(self, include=None, exclude=None): 5 False """ include, exclude = include or (), exclude or () - if not (com.is_list_like(include) and com.is_list_like(exclude)): + if not (is_list_like(include) and is_list_like(exclude)): raise TypeError('include and exclude must both be non-string' ' sequences') selection = tuple(map(frozenset, (include, exclude))) @@ -2292,9 +2351,9 @@ def select_dtypes(self, include=None, exclude=None): # convert the myriad valid dtypes object to a single representation include, exclude = map( - lambda x: frozenset(map(com._get_dtype_from_object, x)), selection) + lambda x: frozenset(map(_get_dtype_from_object, x)), selection) for dtypes in (include, exclude): - 
com._invalidate_string_dtypes(dtypes) + _invalidate_string_dtypes(dtypes) # can't both include AND exclude! if not include.isdisjoint(exclude): @@ -2384,12 +2443,12 @@ def _setitem_array(self, key, value): def _setitem_frame(self, key, value): # support boolean setting with DataFrame input, e.g. # df[df > df2] = 0 - if key.values.size and not com.is_bool_dtype(key.values): + if key.values.size and not is_bool_dtype(key.values): raise TypeError('Must pass DataFrame with boolean values only') self._check_inplace_setting(value) self._check_setitem_copy() - self.where(-key, value, inplace=True) + self._where(-key, value, inplace=True) def _ensure_valid_index(self, value): """ @@ -2578,11 +2637,13 @@ def reindexer(value): value = _sanitize_index(value, self.index, copy=False) if not isinstance(value, (np.ndarray, Index)): if isinstance(value, list) and len(value) > 0: - value = com._possibly_convert_platform(value) + value = _possibly_convert_platform(value) else: value = com._asarray_tuplesafe(value) elif value.ndim == 2: value = value.copy().T + elif isinstance(value, Index): + value = value.copy(deep=True) else: value = value.copy() @@ -2594,7 +2655,7 @@ def reindexer(value): # upcast the scalar dtype, value = _infer_dtype_from_scalar(value) value = np.repeat(value, len(self.index)).astype(dtype) - value = com._possibly_cast_to_datetime(value, dtype) + value = _possibly_cast_to_datetime(value, dtype) # return internal types directly if is_extension_type(value): @@ -2908,8 +2969,8 @@ def _maybe_casted_values(index, labels=None): mask = labels == -1 values = values.take(labels) if mask.any(): - values, changed = com._maybe_upcast_putmask(values, mask, - np.nan) + values, changed = _maybe_upcast_putmask(values, mask, + np.nan) return values new_index = _default_index(len(new_obj)) @@ -3118,37 +3179,36 @@ def sort_values(self, by, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last'): axis = self._get_axis_number(axis) + other_axis = 0 if axis == 1 else 1 - if axis != 0: - raise ValueError('When sorting by column, axis must be 0 (rows)') if not isinstance(by, list): by = [by] - if com.is_sequence(ascending) and len(by) != len(ascending): + if is_sequence(ascending) and len(by) != len(ascending): raise ValueError('Length of ascending (%d) != length of by (%d)' % (len(ascending), len(by))) if len(by) > 1: from pandas.core.groupby import _lexsort_indexer def trans(v): - if com.needs_i8_conversion(v): + if needs_i8_conversion(v): return v.view('i8') return v keys = [] for x in by: - k = self[x].values + k = self.xs(x, axis=other_axis).values if k.ndim == 2: raise ValueError('Cannot sort by duplicate column %s' % str(x)) keys.append(trans(k)) indexer = _lexsort_indexer(keys, orders=ascending, na_position=na_position) - indexer = com._ensure_platform_int(indexer) + indexer = _ensure_platform_int(indexer) else: from pandas.core.groupby import _nargsort by = by[0] - k = self[by].values + k = self.xs(by, axis=other_axis).values if k.ndim == 2: # try to be helpful @@ -3312,7 +3372,7 @@ def sortlevel(self, level=0, axis=0, ascending=True, inplace=False, inplace=inplace, sort_remaining=sort_remaining) def _nsorted(self, columns, n, method, keep): - if not com.is_list_like(columns): + if not is_list_like(columns): columns = [columns] columns = list(columns) ser = getattr(self[columns[0]], method)(n, keep=keep) @@ -3648,30 +3708,38 @@ def combine(self, other, func, fill_value=None, overwrite=True): otherSeries[other_mask] = fill_value # if we have different dtypes, possibily promote - 
new_dtype = this_dtype - if this_dtype != other_dtype: - new_dtype = com._lcd_dtypes(this_dtype, other_dtype) - series = series.astype(new_dtype) + if notnull(series).all(): + new_dtype = this_dtype otherSeries = otherSeries.astype(new_dtype) + else: + new_dtype = _find_common_type([this_dtype, other_dtype]) + if not is_dtype_equal(this_dtype, new_dtype): + series = series.astype(new_dtype) + if not is_dtype_equal(other_dtype, new_dtype): + otherSeries = otherSeries.astype(new_dtype) # see if we need to be represented as i8 (datetimelike) # try to keep us at this dtype - needs_i8_conversion = com.needs_i8_conversion(new_dtype) - if needs_i8_conversion: - this_dtype = new_dtype + needs_i8_conversion_i = needs_i8_conversion(new_dtype) + if needs_i8_conversion_i: arr = func(series, otherSeries, True) else: arr = func(series, otherSeries) if do_fill: - arr = com.ensure_float(arr) + arr = _ensure_float(arr) arr[this_mask & other_mask] = NA # try to downcast back to the original dtype - if needs_i8_conversion: - arr = com._possibly_cast_to_datetime(arr, this_dtype) + if needs_i8_conversion_i: + # ToDo: This conversion should be handled in + # _possibly_cast_to_datetime but the change affects lot... + if is_datetime64tz_dtype(new_dtype): + arr = DatetimeIndex._simple_new(arr, tz=new_dtype.tz) + else: + arr = _possibly_cast_to_datetime(arr, new_dtype) else: - arr = com._possibly_downcast_to_dtype(arr, this_dtype) + arr = _possibly_downcast_to_dtype(arr, this_dtype) result[col] = arr @@ -3749,7 +3817,8 @@ def update(self, other, join='left', overwrite=True, filter_func=None, this = self[col].values that = other[col].values if filter_func is not None: - mask = ~filter_func(this) | isnull(that) + with np.errstate(all='ignore'): + mask = ~filter_func(this) | isnull(that) else: if raise_conflict: mask_this = notnull(that) @@ -4044,7 +4113,8 @@ def f(x): return self._apply_empty_result(func, axis, reduce, *args, **kwds) if isinstance(f, np.ufunc): - results = f(self.values) + with np.errstate(all='ignore'): + results = f(self.values) return self._constructor(data=results, index=self.index, columns=self.columns, copy=False) else: @@ -4314,14 +4384,20 @@ def append(self, other, ignore_index=False, verify_integrity=False): raise TypeError('Can only append a Series if ignore_index=True' ' or if the Series has a name') - index = None if other.name is None else [other.name] + if other.name is None: + index = None + else: + # other must have the same index name as self, otherwise + # index name will be reset + index = Index([other.name], name=self.index.name) + combined_columns = self.columns.tolist() + self.columns.union( other.index).difference(self.columns).tolist() other = other.reindex(combined_columns, copy=False) other = DataFrame(other.values.reshape((1, len(other))), - index=index, columns=combined_columns) + index=index, + columns=combined_columns) other = other._convert(datetime=True, timedelta=True) - if not self.columns.equals(combined_columns): self = self.reindex(columns=combined_columns) elif isinstance(other, list) and not isinstance(other[0], DataFrame): @@ -4573,7 +4649,7 @@ def _dict_round(df, decimals): yield vals def _series_round(s, decimals): - if com.is_integer_dtype(s) or com.is_float_dtype(s): + if is_integer_dtype(s) or is_float_dtype(s): return s.round(decimals) return s @@ -4584,7 +4660,7 @@ def _series_round(s, decimals): if not decimals.index.is_unique: raise ValueError("Index of decimals must be unique") new_cols = [col for col in _dict_round(self, decimals)] - elif 
com.is_integer(decimals): + elif is_integer(decimals): # Dispatch to Series.round new_cols = [_series_round(v, decimals) for _, v in self.iteritems()] @@ -4626,14 +4702,14 @@ def corr(self, method='pearson', min_periods=1): mat = numeric_df.values if method == 'pearson': - correl = _algos.nancorr(com._ensure_float64(mat), minp=min_periods) + correl = _algos.nancorr(_ensure_float64(mat), minp=min_periods) elif method == 'spearman': - correl = _algos.nancorr_spearman(com._ensure_float64(mat), + correl = _algos.nancorr_spearman(_ensure_float64(mat), minp=min_periods) else: if min_periods is None: min_periods = 1 - mat = com._ensure_float64(mat).T + mat = _ensure_float64(mat).T corrf = nanops.get_corr_func(method) K = len(cols) correl = np.empty((K, K), dtype=float) @@ -4688,7 +4764,7 @@ def cov(self, min_periods=None): baseCov = np.cov(mat.T) baseCov = baseCov.reshape((len(cols), len(cols))) else: - baseCov = _algos.nancorr(com._ensure_float64(mat), cov=True, + baseCov = _algos.nancorr(_ensure_float64(mat), cov=True, minp=min_periods) return self._constructor(baseCov, index=cols, columns=cols) @@ -4817,7 +4893,7 @@ def _count_level(self, level, axis=0, numeric_only=False): level = count_axis._get_level_number(level) level_index = count_axis.levels[level] - labels = com._ensure_int64(count_axis.labels[level]) + labels = _ensure_int64(count_axis.labels[level]) counts = lib.count_level_2d(mask, labels, len(level_index), axis=0) result = DataFrame(counts, index=level_index, columns=agg_axis) @@ -4870,7 +4946,8 @@ def f(x): "type %s not implemented." % filter_type) raise_with_traceback(e) - result = f(data.values) + with np.errstate(all='ignore'): + result = f(data.values) labels = data._get_agg_axis(axis) else: if numeric_only: @@ -4898,7 +4975,7 @@ def f(x): # try to coerce to the original dtypes item by item if we can if axis == 0: - result = com._coerce_to_dtypes(result, self.dtypes) + result = _coerce_to_dtypes(result, self.dtypes) return Series(result, index=labels) @@ -5368,13 +5445,13 @@ def _prep_ndarray(values, copy=True): return np.empty((0, 0), dtype=object) def convert(v): - return com._possibly_convert_platform(v) + return _possibly_convert_platform(v) # we could have a 1-dim or 2-dim list here # this is equiv of np.asarray, but does object conversion # and platform dtype preservation try: - if com.is_list_like(values[0]) or hasattr(values[0], 'len'): + if is_list_like(values[0]) or hasattr(values[0], 'len'): values = np.array([convert(v) for v in values]) else: values = convert(values) @@ -5562,7 +5639,7 @@ def _convert_object_array(content, columns, coerce_float=False, dtype=None): def convert(arr): if dtype != object and dtype != np.object: arr = lib.maybe_convert_objects(arr, try_float=coerce_float) - arr = com._possibly_cast_to_datetime(arr, dtype) + arr = _possibly_cast_to_datetime(arr, dtype) return arr arrays = [convert(arr) for arr in content] diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 9ecaaebc2b523..2f78c9acf7972 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1,4 +1,5 @@ # pylint: disable=W0231,E1101 +import collections import warnings import operator import weakref @@ -8,28 +9,46 @@ import pandas.lib as lib import pandas as pd + + +from pandas.types.common import (_coerce_to_dtype, + _ensure_int64, + needs_i8_conversion, + is_scalar, + is_integer, is_bool, + is_bool_dtype, + is_numeric_dtype, + is_datetime64_dtype, + is_timedelta64_dtype, + is_list_like, + is_dict_like, + is_re_compilable) +from pandas.types.cast import 
_maybe_promote, _maybe_upcast_putmask +from pandas.types.missing import isnull, notnull +from pandas.types.generic import ABCSeries, ABCPanel + +from pandas.core.common import (_values_from_object, + _maybe_box_datetimelike, + SettingWithCopyError, SettingWithCopyWarning, + AbstractMethodError) + from pandas.core.base import PandasObject from pandas.core.index import (Index, MultiIndex, _ensure_index, InvalidIndexError) import pandas.core.indexing as indexing from pandas.tseries.index import DatetimeIndex -from pandas.tseries.period import PeriodIndex +from pandas.tseries.period import PeriodIndex, Period from pandas.core.internals import BlockManager import pandas.core.algorithms as algos import pandas.core.common as com import pandas.core.missing as missing -import pandas.core.datetools as datetools from pandas.formats.printing import pprint_thing from pandas.formats.format import format_percentiles +from pandas.tseries.frequencies import to_offset from pandas import compat from pandas.compat.numpy import function as nv from pandas.compat import (map, zip, lrange, string_types, isidentifier, set_function_name) -from pandas.core.common import (isnull, notnull, is_list_like, - _values_from_object, _maybe_promote, - _maybe_box_datetimelike, ABCSeries, - SettingWithCopyError, SettingWithCopyWarning, - AbstractMethodError) import pandas.core.nanops as nanops from pandas.util.decorators import Appender, Substitution, deprecate_kwarg from pandas.core import config @@ -37,14 +56,13 @@ # goal is to be able to define the docs close to function, while still being # able to share _shared_docs = dict() -_shared_doc_kwargs = dict(axes='keywords for axes', klass='NDFrame', - axes_single_arg='int or labels for object', - args_transpose='axes to permute (int or label for' - ' object)') - - -def is_dictlike(x): - return isinstance(x, (dict, com.ABCSeries)) +_shared_doc_kwargs = dict( + axes='keywords for axes', klass='NDFrame', + axes_single_arg='int or labels for object', + args_transpose='axes to permute (int or label for object)', + optional_by=""" + by : str or list of str + Name or list of names which refer to the axis items.""") def _single_replace(self, to_replace, method, inplace, limit): @@ -113,7 +131,7 @@ def _validate_dtype(self, dtype): """ validate the passed dtype """ if dtype is not None: - dtype = com._coerce_to_dtype(dtype) + dtype = _coerce_to_dtype(dtype) # a compound dtype if dtype.kind == 'V': @@ -144,7 +162,7 @@ def _init_mgr(self, mgr, axes=None, dtype=None, copy=False): @property def _constructor(self): - """Used when a manipulation result has the same dimesions as the + """Used when a manipulation result has the same dimensions as the original. """ raise AbstractMethodError(self) @@ -307,7 +325,7 @@ def _from_axes(cls, data, axes, **kwargs): def _get_axis_number(self, axis): axis = self._AXIS_ALIASES.get(axis, axis) - if com.is_integer(axis): + if is_integer(axis): if axis in self._AXIS_NAMES: return axis else: @@ -555,8 +573,8 @@ def swaplevel(self, i=-2, j=-1, axis=0): _shared_docs['rename'] = """ Alter axes input function or functions. Function / dict values must be unique (1-to-1). Labels not contained in a dict / Series will be left - as-is. Alternatively, change ``Series.name`` with a scalar - value (Series only). + as-is. Extra labels listed don't throw an error. Alternatively, change + ``Series.name`` with a scalar value (Series only). 
Parameters ---------- @@ -611,6 +629,11 @@ def swaplevel(self, i=-2, j=-1, axis=0): 0 1 4 1 2 5 2 3 6 + >>> df.rename(index=str, columns={"A": "a", "C": "c"}) + a B + 0 1 4 + 1 2 5 + 2 3 6 """ @Appender(_shared_docs['rename'] % dict(axes='axes keywords for this' @@ -709,8 +732,8 @@ def rename_axis(self, mapper, axis=0, copy=True, inplace=False): 1 2 5 2 3 6 """ - non_mapper = lib.isscalar(mapper) or (com.is_list_like(mapper) and not - com.is_dict_like(mapper)) + non_mapper = is_scalar(mapper) or (is_list_like(mapper) and not + is_dict_like(mapper)) if non_mapper: return self._set_axis_name(mapper, axis=axis) else: @@ -904,7 +927,7 @@ def bool(self): v = self.squeeze() if isinstance(v, (bool, np.bool_)): return bool(v) - elif lib.isscalar(v): + elif is_scalar(v): raise ValueError("bool cannot act on a non-boolean single element " "{0}".format(self.__class__.__name__)) @@ -994,7 +1017,7 @@ def __setstate__(self, state): def to_json(self, path_or_buf=None, orient=None, date_format='epoch', double_precision=10, force_ascii=True, date_unit='ms', - default_handler=None): + default_handler=None, lines=False): """ Convert the object to a JSON string. @@ -1042,6 +1065,13 @@ def to_json(self, path_or_buf=None, orient=None, date_format='epoch', Handler to call if object cannot otherwise be converted to a suitable format for JSON. Should receive a single argument which is the object to convert and return a serialisable object. + lines : boolean, default False + If 'orient' is 'records' write out line delimited json format. Will + throw ValueError if incorrect 'orient' since others are not list + like. + + .. versionadded:: 0.19.0 + + Returns ------- @@ -1054,20 +1084,19 @@ def to_json(self, path_or_buf=None, orient=None, date_format='epoch', date_format=date_format, double_precision=double_precision, force_ascii=force_ascii, date_unit=date_unit, - default_handler=default_handler) + default_handler=default_handler, + lines=lines) def to_hdf(self, path_or_buf, key, **kwargs): - """Activate the HDFStore. + """Write the contained data to an HDF5 file using HDFStore. Parameters ---------- path_or_buf : the path (string) or HDFStore object key : string identifier for the group in the store - mode : optional, {'a', 'w', 'r', 'r+'}, default 'a' + mode : optional, {'a', 'w', 'r+'}, default 'a' - ``'r'`` - Read-only; no data can be modified. ``'w'`` Write; a new file is created (an existing file with the same name would be deleted). @@ -1076,15 +1105,22 @@ def to_hdf(self, path_or_buf, key, **kwargs): and if the file does not exist it is created. ``'r+'`` It is similar to ``'a'``, but the file must already exist. - format : 'fixed(f)|table(t)', default is 'fixed' + format : 'fixed(f)|table(t)', default is 'fixed' fixed(f) : Fixed format Fast writing/reading. Not-appendable, nor searchable table(t) : Table format Write as a PyTables Table structure which may perform worse but allow more flexible operations like searching / selecting subsets of the data - append : boolean, default False + append : boolean, default False For Table formats, append the input data to the existing + data_columns : list of columns, or True, default None + List of columns to create as indexed data columns for on-disk + queries, or True to use all columns. By default only the axes + of the object are indexed. See `here + `__. + + Applicable only to format='table'. 
complevel : int, 1-9, default 0 If a complib is specified compression will be applied where possible @@ -1095,7 +1131,6 @@ def to_hdf(self, path_or_buf, key, **kwargs): If applying compression use the fletcher32 checksum dropna : boolean, default False. If true, ALL nan rows will not be written to store. - """ from pandas.io import pytables @@ -1122,7 +1157,7 @@ def to_msgpack(self, path_or_buf=None, encoding='utf-8', **kwargs): return packers.to_msgpack(path_or_buf, self, encoding=encoding, **kwargs) - def to_sql(self, name, con, flavor='sqlite', schema=None, if_exists='fail', + def to_sql(self, name, con, flavor=None, schema=None, if_exists='fail', index=True, index_label=None, chunksize=None, dtype=None): """ Write records stored in a DataFrame to a SQL database. @@ -1133,12 +1168,11 @@ def to_sql(self, name, con, flavor='sqlite', schema=None, if_exists='fail', Name of SQL table con : SQLAlchemy engine or DBAPI2 connection (legacy mode) Using SQLAlchemy makes it possible to use any DB supported by that - library. - If a DBAPI2 object, only sqlite3 is supported. - flavor : {'sqlite', 'mysql'}, default 'sqlite' - The flavor of SQL to use. Ignored when using SQLAlchemy engine. - 'mysql' is deprecated and will be removed in future versions, but - it will be further supported through SQLAlchemy engines. + library. If a DBAPI2 object, only sqlite3 is supported. + flavor : 'sqlite', default None + DEPRECATED: this parameter will be removed in a future version, + as 'sqlite' is the only supported option if SQLAlchemy is not + installed. schema : string, default None Specify the schema (if database flavor supports this). If None, use default schema. @@ -1641,7 +1675,7 @@ def take(self, indices, axis=0, convert=True, is_copy=True, **kwargs): return result - def xs(self, key, axis=0, level=None, copy=None, drop_level=True): + def xs(self, key, axis=0, level=None, drop_level=True): """ Returns a cross-section (row(s) or column(s)) from the Series/DataFrame. Defaults to cross-section on the rows (axis=0). @@ -1655,8 +1689,6 @@ def xs(self, key, axis=0, level=None, copy=None, drop_level=True): level : object, defaults to first n levels (n=1 or len(key)) In case of a key partially contained in a MultiIndex, indicate which levels are used. Levels can be referred by label or position. - copy : boolean [deprecated] - Whether to make a copy of the data drop_level : boolean, default True If False, returns object with same levels as self. 
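A minimal usage sketch (not part of the patch) of the `xs` signature that remains after the deprecated `copy` keyword is removed above, i.e. only `key`, `axis`, `level` and `drop_level`; the frame and labels below are hypothetical:

import pandas as pd

idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)],
                                names=['outer', 'inner'])
df = pd.DataFrame({'val': [10, 20, 30]}, index=idx)

# Cross-section of the rows with outer == 'a'; the selected level is dropped.
print(df.xs('a', level='outer'))

# Pass drop_level=False to keep the full MultiIndex on the result.
print(df.xs('a', level='outer', drop_level=False))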
@@ -1712,10 +1744,6 @@ def xs(self, key, axis=0, level=None, copy=None, drop_level=True): :ref:`MultiIndex Slicers ` """ - if copy is not None: - warnings.warn("copy keyword is deprecated, " - "default is to return a copy or a view if possible") - axis = self._get_axis_number(axis) labels = self._get_axis(axis) if level is not None: @@ -1756,10 +1784,10 @@ def xs(self, key, axis=0, level=None, copy=None, drop_level=True): else: return self.take(loc, axis=axis, convert=True) - if not lib.isscalar(loc): + if not is_scalar(loc): new_index = self.index[loc] - if lib.isscalar(loc): + if is_scalar(loc): new_values = self._data.fast_xs(loc) # may need to box a datelike-scalar @@ -1770,9 +1798,9 @@ def xs(self, key, axis=0, level=None, copy=None, drop_level=True): if not is_list_like(new_values) or self.ndim == 1: return _maybe_box_datetimelike(new_values) - result = self._constructor_sliced(new_values, index=self.columns, - name=self.index[loc], copy=copy, - dtype=new_values.dtype) + result = self._constructor_sliced( + new_values, index=self.columns, + name=self.index[loc], dtype=new_values.dtype) else: result = self.iloc[loc] @@ -1956,21 +1984,21 @@ def add_suffix(self, suffix): .. versionadded:: 0.17.0 Parameters - ---------- - by : string name or list of names which refer to the axis items - axis : %(axes)s to direct sorting - ascending : bool or list of bool + ----------%(optional_by)s + axis : %(axes_single_arg)s, default 0 + Axis to direct sorting + ascending : bool or list of bool, default True Sort ascending vs. descending. Specify list for multiple sort orders. If this is a list of bools, must match the length of the by. - inplace : bool + inplace : bool, default False if True, perform operation in-place - kind : {`quicksort`, `mergesort`, `heapsort`} + kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort' Choice of sorting algorithm. See also ndarray.np.sort for more information. `mergesort` is the only stable algorithm. For DataFrames, this option is only applied when sorting on a single column or label. - na_position : {'first', 'last'} + na_position : {'first', 'last'}, default 'last' `first` puts NaNs at the beginning, `last` puts NaNs at the end Returns @@ -1992,16 +2020,16 @@ def sort_values(self, by, axis=0, ascending=True, inplace=False, if not None, sort on values in specified index level(s) ascending : boolean, default True Sort ascending vs. descending - inplace : bool + inplace : bool, default False if True, perform operation in-place - kind : {`quicksort`, `mergesort`, `heapsort`} + kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort' Choice of sorting algorithm. See also ndarray.np.sort for more information. `mergesort` is the only stable algorithm. For DataFrames, this option is only applied when sorting on a single column or label. 
- na_position : {'first', 'last'} + na_position : {'first', 'last'}, default 'last' `first` puts NaNs at the beginning, `last` puts NaNs at the end - sort_remaining : bool + sort_remaining : bool, default True if true and sorting by level and index is multilevel, sort by other levels too (in order) after sorting by specified level @@ -2333,7 +2361,7 @@ def _reindex_with_indexers(self, reindexers, fill_value=np.nan, copy=False, index = _ensure_index(index) if indexer is not None: - indexer = com._ensure_int64(indexer) + indexer = _ensure_int64(indexer) # TODO: speed up on homogeneous DataFrame objects new_data = new_data.reindex_indexer(index, indexer, axis=baxis, @@ -2357,7 +2385,11 @@ def _reindex_axis(self, new_index, fill_method, axis, copy): def filter(self, items=None, like=None, regex=None, axis=None): """ - Restrict the info axis to set of items or wildcard + Subset rows or columns of dataframe according to labels in + the specified index. + + Note that this routine does not filter a dataframe on its + contents. The filter is applied to the labels of the index. Parameters ---------- @@ -2367,19 +2399,57 @@ def filter(self, items=None, like=None, regex=None, axis=None): Keep info axis where "arg in col == True" regex : string (regular expression) Keep info axis with re.search(regex, col) == True - axis : int or None - The axis to filter on. By default this is the info axis. The "info - axis" is the axis that is used when indexing with ``[]``. For - example, ``df = DataFrame({'a': [1, 2, 3, 4]]}); df['a']``. So, - the ``DataFrame`` columns are the info axis. + axis : int or string axis name + The axis to filter on. By default this is the info axis, + 'index' for Series, 'columns' for DataFrame + + Returns + ------- + same type as input object + + Examples + -------- + >>> df + one two three + mouse 1 2 3 + rabbit 4 5 6 + + >>> # select columns by name + >>> df.filter(items=['one', 'three']) + one three + mouse 1 3 + rabbit 4 6 + + >>> # select columns by regular expression + >>> df.filter(regex='e$', axis=1) + one three + mouse 1 3 + rabbit 4 6 + + >>> # select rows containing 'bbi' + >>> df.filter(like='bbi', axis=0) + one two three + rabbit 4 5 6 + + See Also + -------- + pandas.DataFrame.select Notes ----- - Arguments are mutually exclusive, but this is not checked for + The ``items``, ``like``, and ``regex`` parameters are + enforced to be mutually exclusive. + ``axis`` defaults to the info axis that is used when indexing + with ``[]``. """ import re + nkw = sum([x is not None for x in [items, like, regex]]) + if nkw > 1: + raise TypeError('Keyword arguments `items`, `like`, or `regex` ' + 'are mutually exclusive') + if axis is None: axis = self._info_axis_name axis_name = self._get_axis_name(axis) @@ -2821,7 +2891,8 @@ def as_matrix(self, columns=None): e.g. If the dtypes are float16 and float32, dtype will be upcast to float32. If dtypes are int32 and uint8, dtype will be upcase to - int32. + int32. By numpy.find_common_type convention, mixing int64 and uint64 + will result in a flot64 dtype. This method is provided for backwards compatibility. Generally, it is recommended to use '.values'. @@ -2847,8 +2918,9 @@ def values(self): with care if you are not dealing with the blocks. e.g. If the dtypes are float16 and float32, dtype will be upcast to - float32. If dtypes are int32 and uint8, dtype will be upcase to - int32. + float32. If dtypes are int32 and uint8, dtype will be upcast to + int32. 
By numpy.find_common_type convention, mixing int64 and uint64 + will result in a float64 dtype. """ return self.as_matrix() @@ -2938,7 +3010,11 @@ def astype(self, dtype, copy=True, raise_on_error=True, **kwargs): Parameters ---------- - dtype : numpy.dtype or Python type + dtype : data type, or dict of column name -> data type + Use a numpy.dtype or Python type to cast entire pandas object to + the same type. Alternatively, use {col: dtype, ...}, where col is a + column label and dtype is a numpy.dtype or Python type to cast one + or more of the DataFrame's columns to column-specific types. raise_on_error : raise on invalid input kwargs : keyword arguments to pass on to the constructor @@ -2946,10 +3022,36 @@ ------- casted : type of caller """ + if isinstance(dtype, collections.Mapping): + if self.ndim == 1: # i.e. Series + if len(dtype) > 1 or list(dtype.keys())[0] != self.name: + raise KeyError('Only the Series name can be used for ' + 'the key in Series dtype mappings.') + new_type = list(dtype.values())[0] + return self.astype(new_type, copy, raise_on_error, **kwargs) + elif self.ndim > 2: + raise NotImplementedError( + 'astype() only accepts a dtype arg of type dict when ' + 'invoked on Series and DataFrames. A single dtype must be ' + 'specified when invoked on a Panel.' + ) + for col_name in dtype.keys(): + if col_name not in self: + raise KeyError('Only a column name can be used for the ' + 'key in a dtype mappings argument.') + from pandas import concat + results = [] + for col_name, col in self.iteritems(): + if col_name in dtype: + results.append(col.astype(dtype[col_name], copy=copy)) + else: + results.append(col.copy() if copy else col) + return concat(results, axis=1, copy=False) - mgr = self._data.astype(dtype=dtype, copy=copy, - raise_on_error=raise_on_error, **kwargs) - return self._constructor(mgr).__finalize__(self) + # else, only a single dtype is given + new_data = self._data.astype(dtype=dtype, copy=copy, + raise_on_error=raise_on_error, **kwargs) + return self._constructor(new_data).__finalize__(self) def copy(self, deep=True): """ @@ -3153,10 +3255,10 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, return self if self.ndim == 1: - if isinstance(value, (dict, com.ABCSeries)): + if isinstance(value, (dict, ABCSeries)): from pandas import Series value = Series(value) - elif not com.is_list_like(value): + elif not is_list_like(value): pass else: raise ValueError("invalid fill value with a %s" % @@ -3166,7 +3268,7 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, inplace=inplace, downcast=downcast) - elif isinstance(value, (dict, com.ABCSeries)): + elif isinstance(value, (dict, ABCSeries)): if axis == 1: raise NotImplementedError('Currently only can fill ' 'with dict/Series column ' @@ -3179,7 +3281,7 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, obj = result[k] obj.fillna(v, limit=limit, inplace=True) return result - elif not com.is_list_like(value): + elif not is_list_like(value): new_data = self._data.fillna(value=value, limit=limit, inplace=inplace, downcast=downcast) @@ -3305,7 +3407,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, and play with this method to gain intuition about how it works. 
""" - if not com.is_bool(regex) and to_replace is not None: + if not is_bool(regex) and to_replace is not None: raise AssertionError("'to_replace' must be 'None' if 'regex' is " "not a bool") if axis is not None: @@ -3318,15 +3420,15 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, if value is None: # passing a single value that is scalar like # when value is None (GH5319), for compat - if not is_dictlike(to_replace) and not is_dictlike(regex): + if not is_dict_like(to_replace) and not is_dict_like(regex): to_replace = [to_replace] if isinstance(to_replace, (tuple, list)): return _single_replace(self, to_replace, method, inplace, limit) - if not is_dictlike(to_replace): - if not is_dictlike(regex): + if not is_dict_like(to_replace): + if not is_dict_like(regex): raise TypeError('If "to_replace" and "value" are both None' ' and "to_replace" is not a list, then ' 'regex must be a mapping') @@ -3336,7 +3438,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, items = list(compat.iteritems(to_replace)) keys, values = zip(*items) - are_mappings = [is_dictlike(v) for v in values] + are_mappings = [is_dict_like(v) for v in values] if any(are_mappings): if not all(are_mappings): @@ -3369,8 +3471,8 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, return self new_data = self._data - if is_dictlike(to_replace): - if is_dictlike(value): # {'A' : NA} -> {'A' : 0} + if is_dict_like(to_replace): + if is_dict_like(value): # {'A' : NA} -> {'A' : 0} res = self if inplace else self.copy() for c, src in compat.iteritems(to_replace): if c in value and c in self: @@ -3380,7 +3482,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, return None if inplace else res # {'A': NA} -> 0 - elif not com.is_list_like(value): + elif not is_list_like(value): for k, src in compat.iteritems(to_replace): if k in self: new_data = new_data.replace(to_replace=src, @@ -3392,8 +3494,8 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, raise TypeError('value argument must be scalar, dict, or ' 'Series') - elif com.is_list_like(to_replace): # [NA, ''] -> [0, 'missing'] - if com.is_list_like(value): + elif is_list_like(to_replace): # [NA, ''] -> [0, 'missing'] + if is_list_like(value): if len(to_replace) != len(value): raise ValueError('Replacement lists must match ' 'in length. 
Expecting %d got %d ' % @@ -3409,8 +3511,8 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, value=value, inplace=inplace, regex=regex) elif to_replace is None: - if not (com.is_re_compilable(regex) or - com.is_list_like(regex) or is_dictlike(regex)): + if not (is_re_compilable(regex) or + is_list_like(regex) or is_dict_like(regex)): raise TypeError("'regex' must be a string or a compiled " "regular expression or a list or dict of " "strings or regular expressions, you " @@ -3421,7 +3523,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, else: # dest iterable dict-like - if is_dictlike(value): # NA -> {'A' : 0, 'B' : -1} + if is_dict_like(value): # NA -> {'A' : 0, 'B' : -1} new_data = self._data for k, v in compat.iteritems(value): @@ -3431,7 +3533,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, inplace=inplace, regex=regex) - elif not com.is_list_like(value): # NA -> 0 + elif not is_list_like(value): # NA -> 0 new_data = self._data.replace(to_replace=to_replace, value=value, inplace=inplace, regex=regex) @@ -3588,26 +3690,118 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False, return res # ---------------------------------------------------------------------- - # Action Methods + # Timeseries methods Methods + + def asof(self, where, subset=None): + """ + The last row without any NaN is taken (or the last row without + NaN considering only the subset of columns in the case of a DataFrame) + + .. versionadded:: 0.19.0 For DataFrame + + If there is no good value, NaN is returned. + + Parameters + ---------- + where : date or array of dates + subset : string or list of strings, default None + if not None use these columns for NaN propagation + + Notes + ----- + Dates are assumed to be sorted + Raises if this is not the case + + Returns + ------- + where is scalar + + - value or NaN if input is Series + - Series if input is DataFrame + + where is Index: same shape object as input + + See Also + -------- + merge_asof - def isnull(self): """ + + if isinstance(where, compat.string_types): + from pandas import to_datetime + where = to_datetime(where) + + if not self.index.is_monotonic: + raise ValueError("asof requires a sorted index") + + if isinstance(self, ABCSeries): + if subset is not None: + raise ValueError("subset is not valid for Series") + nulls = self.isnull() + elif self.ndim > 2: + raise NotImplementedError("asof is not implemented " + "for {type}".format(type(self))) + else: + if subset is None: + subset = self.columns + if not is_list_like(subset): + subset = [subset] + nulls = self[subset].isnull().any(1) + + if not is_list_like(where): + start = self.index[0] + if isinstance(self.index, PeriodIndex): + where = Period(where, freq=self.index.freq).ordinal + start = start.ordinal + + if where < start: + return np.nan + + loc = self.index.searchsorted(where, side='right') + if loc > 0: + loc -= 1 + while nulls[loc] and loc > 0: + loc -= 1 + return self.iloc[loc] + + if not isinstance(where, Index): + where = Index(where) + + locs = self.index.asof_locs(where, ~(nulls.values)) + + # mask the missing + missing = locs == -1 + data = self.take(locs, is_copy=False) + data.index = where + data.loc[missing] = np.nan + return data + + # ---------------------------------------------------------------------- + # Action Methods + + _shared_docs['isnull'] = """ Return a boolean same-sized object indicating if the values are null. 
See Also -------- notnull : boolean inverse of isnull """ + + @Appender(_shared_docs['isnull']) + def isnull(self): return isnull(self).__finalize__(self) - def notnull(self): - """Return a boolean same-sized object indicating if the values are + _shared_docs['isnotnull'] = """ + Return a boolean same-sized object indicating if the values are not null. See Also -------- isnull : boolean inverse of notnull """ + + @Appender(_shared_docs['isnotnull']) + def notnull(self): return notnull(self).__finalize__(self) def clip(self, lower=None, upper=None, axis=None, *args, **kwargs): @@ -3656,14 +3850,14 @@ def clip(self, lower=None, upper=None, axis=None, *args, **kwargs): 3 0.230930 0.000000 4 1.100000 0.570967 """ - if isinstance(self, com.ABCPanel): + if isinstance(self, ABCPanel): raise NotImplementedError("clip is not supported yet for panels") axis = nv.validate_clip_with_axis(axis, args, kwargs) # GH 2747 (arguments were reversed) if lower is not None and upper is not None: - if lib.isscalar(lower) and lib.isscalar(upper): + if is_scalar(lower) and is_scalar(upper): lower, upper = min(lower, upper), max(lower, upper) result = self @@ -3780,16 +3974,20 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, def asfreq(self, freq, method=None, how=None, normalize=False): """ - Convert all TimeSeries inside to specified frequency using DateOffset - objects. Optionally provide fill method to pad/backfill missing values. + Convert TimeSeries to specified frequency. + + Optionally provide filling method to pad/backfill missing values. Parameters ---------- freq : DateOffset object, or string - method : {'backfill', 'bfill', 'pad', 'ffill', None} - Method to use for filling holes in reindexed Series - pad / ffill: propagate last valid observation forward to next valid - backfill / bfill: use NEXT valid observation to fill method + method : {'backfill'/'bfill', 'pad'/'ffill'}, default None + Method to use for filling holes in reindexed Series (note this + does not fill NaNs that already were present): + + * 'pad' / 'ffill': propagate last valid observation forward to next + valid + * 'backfill' / 'bfill': use NEXT valid observation to fill how : {'start', 'end'}, default end For PeriodIndex only, see PeriodIndex.asfreq normalize : bool, default False @@ -3798,6 +3996,9 @@ def asfreq(self, freq, method=None, how=None, normalize=False): Returns ------- converted : type of caller + + To learn more about the frequency strings, please see `this link +`__. """ from pandas.tseries.resample import asfreq return asfreq(self, freq, method=method, how=how, normalize=normalize) @@ -3846,10 +4047,12 @@ def between_time(self, start_time, end_time, include_start=True, def resample(self, rule, how=None, axis=0, fill_method=None, closed=None, label=None, convention='start', kind=None, loffset=None, - limit=None, base=0): + limit=None, base=0, on=None, level=None): """ - Convenience method for frequency conversion and resampling of regular - time-series data. + Convenience method for frequency conversion and resampling of time + series. Object must have a datetime-like index (DatetimeIndex, + PeriodIndex, or TimedeltaIndex), or pass datetime-like values + to the on or level keyword. Parameters ---------- @@ -3867,7 +4070,20 @@ def resample(self, rule, how=None, axis=0, fill_method=None, closed=None, For frequencies that evenly subdivide 1 day, the "origin" of the aggregated intervals. For example, for '5min' frequency, base could range from 0 through 4. 
Defaults to 0 + on : string, optional + For a DataFrame, column to use instead of index for resampling. + Column must be datetime-like. + + .. versionadded:: 0.19.0 + level : string or int, optional + For a MultiIndex, level (name or number) to use for + resampling. Level must be datetime-like. + + .. versionadded:: 0.19.0 + + To learn more about the offset strings, please see `this link + `__. Examples -------- @@ -3969,12 +4185,11 @@ def resample(self, rule, how=None, axis=0, fill_method=None, closed=None, """ from pandas.tseries.resample import (resample, _maybe_process_deprecations) - axis = self._get_axis_number(axis) r = resample(self, freq=rule, label=label, closed=closed, axis=axis, kind=kind, loffset=loffset, convention=convention, - base=base) + base=base, key=on, level=level) return _maybe_process_deprecations(r, how=how, fill_method=fill_method, @@ -4290,54 +4505,14 @@ def _align_series(self, other, join='outer', axis=None, level=None, right = right.fillna(fill_value, method=method, limit=limit) return left.__finalize__(self), right.__finalize__(other) - _shared_docs['where'] = (""" - Return an object of same shape as self and whose corresponding - entries are from self where cond is %(cond)s and otherwise are from - other. - - Parameters - ---------- - cond : boolean %(klass)s, array or callable - If cond is callable, it is computed on the %(klass)s and - should return boolean %(klass)s or array. - The callable must not change input %(klass)s - (though pandas doesn't check it). - - .. versionadded:: 0.18.1 - - A callable can be used as cond. - - other : scalar, %(klass)s, or callable - If other is callable, it is computed on the %(klass)s and - should return scalar or %(klass)s. - The callable must not change input %(klass)s - (though pandas doesn't check it). - - .. versionadded:: 0.18.1 - - A callable can be used as other. - - inplace : boolean, default False - Whether to perform the operation in place on the data - axis : alignment axis if needed, default None - level : alignment level if needed, default None - try_cast : boolean, default False - try to cast the result back to the input type (if possible), - raise_on_error : boolean, default True - Whether to raise on invalid data types (e.g. trying to where on - strings) - - Returns - ------- - wh : same type as caller - """) - - @Appender(_shared_docs['where'] % dict(_shared_doc_kwargs, cond="True")) - def where(self, cond, other=np.nan, inplace=False, axis=None, level=None, - try_cast=False, raise_on_error=True): + def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None, + try_cast=False, raise_on_error=True): + """ + Equivalent to public method `where`, except that `other` is not + applied as a function even if callable. Used in __setitem__. 
+ """ cond = com._apply_if_callable(cond, self) - other = com._apply_if_callable(other, self) if isinstance(cond, NDFrame): cond, _ = cond.align(self, join='right', broadcast_axis=1) @@ -4385,10 +4560,12 @@ def where(self, cond, other=np.nan, inplace=False, axis=None, level=None, new_other = np.array(other, dtype=self.dtype) except ValueError: new_other = np.array(other) + except TypeError: + new_other = other # we can end up comparing integers and m8[ns] # which is a numpy no no - is_i8 = com.needs_i8_conversion(self.dtype) + is_i8 = needs_i8_conversion(self.dtype) if is_i8: matches = False else: @@ -4397,7 +4574,7 @@ def where(self, cond, other=np.nan, inplace=False, axis=None, level=None, if matches is False or not matches.all(): # coerce other to a common dtype if we can - if com.needs_i8_conversion(self.dtype): + if needs_i8_conversion(self.dtype): try: other = np.array(other, dtype=self.dtype) except: @@ -4450,7 +4627,7 @@ def where(self, cond, other=np.nan, inplace=False, axis=None, level=None, dtype, fill_value = _maybe_promote(other.dtype) new_other = np.empty(len(icond), dtype=dtype) new_other.fill(fill_value) - com._maybe_upcast_putmask(new_other, icond, other) + _maybe_upcast_putmask(new_other, icond, other) other = new_other else: @@ -4493,7 +4670,111 @@ def where(self, cond, other=np.nan, inplace=False, axis=None, level=None, return self._constructor(new_data).__finalize__(self) - @Appender(_shared_docs['where'] % dict(_shared_doc_kwargs, cond="False")) + _shared_docs['where'] = (""" + Return an object of same shape as self and whose corresponding + entries are from self where cond is %(cond)s and otherwise are from + other. + + Parameters + ---------- + cond : boolean %(klass)s, array or callable + If cond is callable, it is computed on the %(klass)s and + should return boolean %(klass)s or array. + The callable must not change input %(klass)s + (though pandas doesn't check it). + + .. versionadded:: 0.18.1 + + A callable can be used as cond. + + other : scalar, %(klass)s, or callable + If other is callable, it is computed on the %(klass)s and + should return scalar or %(klass)s. + The callable must not change input %(klass)s + (though pandas doesn't check it). + + .. versionadded:: 0.18.1 + + A callable can be used as other. + + inplace : boolean, default False + Whether to perform the operation in place on the data + axis : alignment axis if needed, default None + level : alignment level if needed, default None + try_cast : boolean, default False + try to cast the result back to the input type (if possible), + raise_on_error : boolean, default True + Whether to raise on invalid data types (e.g. trying to where on + strings) + + Returns + ------- + wh : same type as caller + + Notes + ----- + The %(name)s method is an application of the if-then idiom. For each + element in the calling DataFrame, if ``cond`` is ``%(cond)s`` the + element is used; otherwise the corresponding element from the DataFrame + ``other`` is used. + + The signature for :func:`DataFrame.where` differs from + :func:`numpy.where`. Roughly ``df1.where(m, df2)`` is equivalent to + ``np.where(m, df1, df2)``. + + For further details and examples see the ``%(name)s`` documentation in + :ref:`indexing `. 
+ + Examples + -------- + >>> s = pd.Series(range(5)) + >>> s.where(s > 0) + 0 NaN + 1 1.0 + 2 2.0 + 3 3.0 + 4 4.0 + + >>> df = pd.DataFrame(np.arange(10).reshape(-1, 2), columns=['A', 'B']) + >>> m = df %% 3 == 0 + >>> df.where(m, -df) + A B + 0 0 -1 + 1 -2 3 + 2 -4 -5 + 3 6 -7 + 4 -8 9 + >>> df.where(m, -df) == np.where(m, df, -df) + A B + 0 True True + 1 True True + 2 True True + 3 True True + 4 True True + >>> df.where(m, -df) == df.mask(~m, -df) + A B + 0 True True + 1 True True + 2 True True + 3 True True + 4 True True + + See Also + -------- + :func:`DataFrame.%(name_other)s` + """) + + @Appender(_shared_docs['where'] % dict(_shared_doc_kwargs, cond="True", + name='where', name_other='mask')) + def where(self, cond, other=np.nan, inplace=False, axis=None, level=None, + try_cast=False, raise_on_error=True): + + other = com._apply_if_callable(other, self) + return self._where(cond, other, inplace, axis, level, try_cast, + raise_on_error) + + @Appender(_shared_docs['where'] % dict(_shared_doc_kwargs, cond="False", + name='mask', name_other='where')) def mask(self, cond, other=np.nan, inplace=False, axis=None, level=None, try_cast=False, raise_on_error=True): @@ -4511,7 +4792,7 @@ def mask(self, cond, other=np.nan, inplace=False, axis=None, level=None, periods : int Number of periods to move, can be positive or negative freq : DateOffset, timedelta, or time rule string, optional - Increment to use from datetools module or time rule (e.g. 'EOM'). + Increment to use from the tseries module or time rule (e.g. 'EOM'). See Notes. axis : %(axes_single_arg)s @@ -4584,7 +4865,7 @@ def tshift(self, periods=1, freq=None, axis=0): periods : int Number of periods to move, can be positive or negative freq : DateOffset, timedelta, or time rule string, default None - Increment to use from datetools module or time rule (e.g. 'EOM') + Increment to use from the tseries module or time rule (e.g. 
'EOM') axis : int or basestring Corresponds to the axis that contains the Index @@ -4614,11 +4895,11 @@ def tshift(self, periods=1, freq=None, axis=0): return self if isinstance(freq, string_types): - freq = datetools.to_offset(freq) + freq = to_offset(freq) block_axis = self._get_block_manager_axis(axis) if isinstance(index, PeriodIndex): - orig_freq = datetools.to_offset(index.freq) + orig_freq = to_offset(index.freq) if freq == orig_freq: new_data = self._data.copy() new_data.axes[block_axis] = index.shift(periods) @@ -4908,7 +5189,7 @@ def describe_categorical_1d(data): if result[1] > 0: top, freq = objcounts.index[0], objcounts.iloc[0] - if com.is_datetime64_dtype(data): + if is_datetime64_dtype(data): asint = data.dropna().values.view('i8') names += ['top', 'freq', 'first', 'last'] result += [lib.Timestamp(top), freq, @@ -4921,11 +5202,11 @@ def describe_categorical_1d(data): return pd.Series(result, index=names, name=data.name) def describe_1d(data): - if com.is_bool_dtype(data): + if is_bool_dtype(data): return describe_categorical_1d(data) - elif com.is_numeric_dtype(data): + elif is_numeric_dtype(data): return describe_numeric_1d(data) - elif com.is_timedelta64_dtype(data): + elif is_timedelta64_dtype(data): return describe_numeric_1d(data) else: return describe_categorical_1d(data) @@ -4933,10 +5214,9 @@ def describe_1d(data): if self.ndim == 1: return describe_1d(self) elif (include is None) and (exclude is None): - if len(self._get_numeric_data()._info_axis) > 0: - # when some numerics are found, keep only numerics - data = self.select_dtypes(include=[np.number]) - else: + # when some numerics are found, keep only numerics + data = self.select_dtypes(include=[np.number]) + if len(data.columns) == 0: data = self elif include == 'all': if exclude is not None: @@ -5012,7 +5292,7 @@ def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, rs = (data.div(data.shift(periods=periods, freq=freq, axis=axis, **kwargs)) - 1) if freq is None: - mask = com.isnull(_values_from_object(self)) + mask = isnull(_values_from_object(self)) np.putmask(rs.values, mask, np.nan) return rs @@ -5177,11 +5457,12 @@ def _add_series_or_dataframe_operations(cls): @Appender(rwindow.rolling.__doc__) def rolling(self, window, min_periods=None, freq=None, center=False, - win_type=None, axis=0): + win_type=None, on=None, axis=0): axis = self._get_axis_number(axis) return rwindow.rolling(self, window=window, min_periods=min_periods, freq=freq, - center=center, win_type=win_type, axis=axis) + center=center, win_type=win_type, + on=on, axis=axis) cls.rolling = rolling @@ -5338,10 +5619,10 @@ def _make_cum_function(cls, name, name1, name2, axis_descr, desc, accum_func, mask_a, mask_b): @Substitution(outname=name, desc=desc, name1=name1, name2=name2, axis_descr=axis_descr) - @Appender("Return cumulative {0} over requested axis.".format(name) + + @Appender("Return {0} over requested axis.".format(desc) + _cnum_doc) - def cum_func(self, axis=None, dtype=None, out=None, skipna=True, **kwargs): - nv.validate_cum_func(tuple(), kwargs, fname=name) + def cum_func(self, axis=None, skipna=True, *args, **kwargs): + skipna = nv.validate_cum_func_with_skipna(skipna, args, kwargs, name) if axis is None: axis = self._stat_axis_number else: diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index bea62e98e4a2a..66e30229cd52b 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -13,6 +13,25 @@ from pandas import compat from pandas.compat.numpy import function as nv from pandas.compat.numpy 
import _np_version_under1p8 + +from pandas.types.common import (_DATELIKE_DTYPES, + is_numeric_dtype, + is_timedelta64_dtype, is_datetime64_dtype, + is_categorical_dtype, + is_datetime_or_timedelta_dtype, + is_bool, is_integer_dtype, + is_complex_dtype, + is_bool_dtype, + is_scalar, + _ensure_float64, + _ensure_platform_int, + _ensure_int64, + _ensure_object, + _ensure_float) +from pandas.types.cast import _possibly_downcast_to_dtype +from pandas.types.missing import isnull, notnull, _maybe_fill + +from pandas.core.common import _values_from_object, AbstractMethodError from pandas.core.base import (PandasObject, SelectionMixin, GroupByError, DataError, SpecificationError) from pandas.core.categorical import Categorical @@ -30,14 +49,7 @@ import pandas.core.algorithms as algos import pandas.core.common as com -from pandas.core.common import(_possibly_downcast_to_dtype, isnull, - notnull, _DATELIKE_DTYPES, is_numeric_dtype, - is_timedelta64_dtype, is_datetime64_dtype, - is_categorical_dtype, _values_from_object, - is_datetime_or_timedelta_dtype, is_bool, - is_bool_dtype, AbstractMethodError, - _maybe_fill) -from pandas.core.config import option_context, is_callable +from pandas.core.config import option_context import pandas.lib as lib from pandas.lib import Timestamp import pandas.tslib as tslib @@ -95,7 +107,7 @@ def _groupby_function(name, alias, npfunc, numeric_only=True, @Appender(_doc_template) @Appender(_local_template) def f(self): - self._set_selection_from_grouper() + self._set_group_selection() try: return self._cython_agg_general(alias, numeric_only=numeric_only) except AssertionError as e: @@ -243,7 +255,8 @@ def _set_grouper(self, obj, sort=False): Parameters ---------- obj : the subject object - + sort : bool, default False + whether the resulting grouper should be sorted """ if self.key is not None and self.level is not None: @@ -457,8 +470,21 @@ def _selected_obj(self): else: return self.obj[self._selection] - def _set_selection_from_grouper(self): - """ we may need create a selection if we have non-level groupers """ + def _reset_group_selection(self): + """ + Clear group based selection. Used for methods needing to return info on + each group regardless of whether a group selection was previously set. + """ + if self._group_selection is not None: + self._group_selection = None + # GH12839 clear cached selection too when changing group selection + self._reset_cache('_selected_obj') + + def _set_group_selection(self): + """ + Create group based selection. Used when selection is not passed + directly but instead via a grouper. 
+ """ grp = self.grouper if self.as_index and getattr(grp, 'groupings', None) is not None and \ self.obj.ndim > 1: @@ -468,6 +494,8 @@ def _set_selection_from_grouper(self): if len(groupers): self._group_selection = ax.difference(Index(groupers)).tolist() + # GH12839 clear selected obj cache when group selection changes + self._reset_cache('_selected_obj') def _set_result_index_ordered(self, result): # set the result index on the passed values object and @@ -511,7 +539,7 @@ def _make_wrapper(self, name): # need to setup the selection # as are not passed directly but in the grouper - self._set_selection_from_grouper() + self._set_group_selection() f = getattr(self._selected_obj, name) if not isinstance(f, types.MethodType): @@ -647,11 +675,12 @@ def apply(self, func, *args, **kwargs): # resolve functions to their callable functions prior, this # wouldn't be needed if args or kwargs: - if is_callable(func): + if callable(func): @wraps(func) def f(g): - return func(g, *args, **kwargs) + with np.errstate(all='ignore'): + return func(g, *args, **kwargs) else: raise ValueError('func must be a callable if args or ' 'kwargs are supplied') @@ -737,7 +766,7 @@ def _try_cast(self, result, obj): else: dtype = obj.dtype - if not lib.isscalar(result): + if not is_scalar(result): result = _possibly_downcast_to_dtype(result, dtype) return result @@ -802,7 +831,7 @@ def _python_agg_general(self, func, *args, **kwargs): # since we are masking, make sure that we have a float object values = result if is_numeric_dtype(values.dtype): - values = com.ensure_float(values) + values = _ensure_float(values) output[name] = self._try_cast(values[mask], result) @@ -979,7 +1008,7 @@ def mean(self, *args, **kwargs): except GroupByError: raise except Exception: # pragma: no cover - self._set_selection_from_grouper() + self._set_group_selection() f = lambda x: x.mean(axis=self.axis) return self._python_agg_general(f) @@ -997,7 +1026,7 @@ def median(self): raise except Exception: # pragma: no cover - self._set_selection_from_grouper() + self._set_group_selection() def f(x): if isinstance(x, np.ndarray): @@ -1040,7 +1069,7 @@ def var(self, ddof=1, *args, **kwargs): if ddof == 1: return self._cython_agg_general('var') else: - self._set_selection_from_grouper() + self._set_group_selection() f = lambda x: x.var(ddof=ddof) return self._python_agg_general(f) @@ -1178,31 +1207,55 @@ def nth(self, n, dropna=None): Examples -------- - >>> df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) + + >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2], + ... 'B': [np.nan, 2, 3, 4, 5]}, columns=['A', 'B']) >>> g = df.groupby('A') >>> g.nth(0) - A B - 0 1 NaN - 2 5 6 + B + A + 1 NaN + 2 3.0 >>> g.nth(1) - A B - 1 1 4 + B + A + 1 2.0 + 2 5.0 >>> g.nth(-1) - A B - 1 1 4 - 2 5 6 + B + A + 1 4.0 + 2 5.0 + >>> g.nth([0, 1]) + B + A + 1 NaN + 1 2.0 + 2 3.0 + 2 5.0 + + Specifying ``dropna`` allows count ignoring NaN + >>> g.nth(0, dropna='any') - B - A - 1 4 - 5 6 + B + A + 1 2.0 + 2 3.0 + + NaNs denote group exhausted when using dropna - # NaNs denote group exhausted when using dropna - >>> g.nth(1, dropna='any') + >>> g.nth(3, dropna='any') B - A + A 1 NaN - 5 NaN + 2 NaN + + Specifying ``as_index=False`` in ``groupby`` keeps the original index. 
+ + >>> df.groupby('A', as_index=False).nth(1) + A B + 1 1 2.0 + 4 2 5.0 """ if isinstance(n, int): @@ -1216,7 +1269,7 @@ def nth(self, n, dropna=None): raise TypeError("n needs to be an int or a list/set/tuple of ints") nth_values = np.array(nth_values, dtype=np.intp) - self._set_selection_from_grouper() + self._set_group_selection() if not dropna: mask = np.in1d(self._cumcount_array(), nth_values) | \ @@ -1324,7 +1377,7 @@ def cumcount(self, ascending=True): dtype: int64 """ - self._set_selection_from_grouper() + self._set_group_selection() index = self._selected_obj.index cumcounts = self._cumcount_array(ascending=ascending) @@ -1346,7 +1399,7 @@ def cumsum(self, axis=0, *args, **kwargs): """Cumulative sum for each group""" nv.validate_groupby_func('cumsum', args, kwargs) if axis != 0: - return self.apply(lambda x: x.cumprod(axis=axis)) + return self.apply(lambda x: x.cumsum(axis=axis)) return self._cython_transform('cumsum') @@ -1402,6 +1455,7 @@ def head(self, n=5): 0 1 2 2 5 6 """ + self._reset_group_selection() mask = self._cumcount_array() < n return self._selected_obj[mask] @@ -1428,6 +1482,7 @@ def tail(self, n=5): 0 a 1 2 b 1 """ + self._reset_group_selection() mask = self._cumcount_array(ascending=False) < n return self._selected_obj[mask] @@ -1577,7 +1632,7 @@ def size(self): """ ids, _, ngroup = self.group_info - ids = com._ensure_platform_int(ids) + ids = _ensure_platform_int(ids) out = np.bincount(ids[ids != -1], minlength=ngroup or None) return Series(out, index=self.result_index, dtype='int64') @@ -1613,7 +1668,7 @@ def group_info(self): comp_ids, obs_group_ids = self._get_compressed_labels() ngroups = len(obs_group_ids) - comp_ids = com._ensure_int64(comp_ids) + comp_ids = _ensure_int64(comp_ids) return comp_ids, obs_group_ids, ngroups def _get_compressed_labels(self): @@ -1653,7 +1708,7 @@ def get_group_levels(self): name_list = [] for ping, labels in zip(self.groupings, self.recons_labels): - labels = com._ensure_platform_int(labels) + labels = _ensure_platform_int(labels) levels = ping.group_index.take(labels) name_list.append(levels) @@ -1762,11 +1817,11 @@ def _cython_operation(self, kind, values, how, axis): values = values.view('int64') is_numeric = True elif is_bool_dtype(values.dtype): - values = _algos.ensure_float64(values) - elif com.is_integer_dtype(values): + values = _ensure_float64(values) + elif is_integer_dtype(values): values = values.astype('int64', copy=False) - elif is_numeric and not com.is_complex_dtype(values): - values = _algos.ensure_float64(values) + elif is_numeric and not is_complex_dtype(values): + values = _ensure_float64(values) else: values = values.astype(object) @@ -1775,7 +1830,7 @@ def _cython_operation(self, kind, values, how, axis): kind, how, values, is_numeric) except NotImplementedError: if is_numeric: - values = _algos.ensure_float64(values) + values = _ensure_float64(values) func, dtype_str = self._get_cython_function( kind, how, values, is_numeric) else: @@ -1803,7 +1858,7 @@ def _cython_operation(self, kind, values, how, axis): result = self._transform( result, accum, values, labels, func, is_numeric) - if com.is_integer_dtype(result): + if is_integer_dtype(result): if len(result[result == tslib.iNaT]) > 0: result = result.astype('float64') result[result == tslib.iNaT] = np.nan @@ -1816,7 +1871,7 @@ def _cython_operation(self, kind, values, how, axis): result, (counts > 0).view(np.uint8)) except ValueError: result = lib.row_bool_subset_object( - com._ensure_object(result), + _ensure_object(result), (counts > 
0).view(np.uint8)) else: result = result[counts > 0] @@ -1978,7 +2033,7 @@ def generate_bins_generic(values, binner, closed): class BinGrouper(BaseGrouper): def __init__(self, bins, binlabels, filter_empty=False, mutated=False): - self.bins = com._ensure_int64(bins) + self.bins = _ensure_int64(bins) self.binlabels = _ensure_index(binlabels) self._filter_empty_groups = filter_empty self.mutated = mutated @@ -2043,7 +2098,7 @@ def group_info(self): obs_group_ids = np.arange(ngroups) rep = np.diff(np.r_[0, self.bins]) - rep = com._ensure_platform_int(rep) + rep = _ensure_platform_int(rep) if ngroups == len(self.bins): comp_ids = np.repeat(np.arange(ngroups), rep) else: @@ -2249,7 +2304,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, self.grouper = to_timedelta(self.grouper) def __repr__(self): - return 'Grouping(%s)' % self.name + return 'Grouping({0})'.format(self.name) def __iter__(self): return iter(self.indices) @@ -2431,7 +2486,7 @@ def is_in_obj(gpr): def _is_label_like(val): return (isinstance(val, compat.string_types) or - (val is not None and lib.isscalar(val))) + (val is not None and is_scalar(val))) def _convert_grouper(axis, grouper): @@ -2653,7 +2708,7 @@ def _aggregate_multiple_funcs(self, arg, _level): results[name] = obj.aggregate(func) if isinstance(list(compat.itervalues(results))[0], - com.ABCDataFrame): + DataFrame): # let higher level handle if _level: @@ -2852,9 +2907,9 @@ def nunique(self, dropna=True): 'val.dtype must be object, got %s' % val.dtype val, _ = algos.factorize(val, sort=False) sorter = np.lexsort((val, ids)) - isnull = lambda a: a == -1 + _isnull = lambda a: a == -1 else: - isnull = com.isnull + _isnull = isnull ids, val = ids[sorter], val[sorter] @@ -2864,7 +2919,7 @@ def nunique(self, dropna=True): inc = np.r_[1, val[1:] != val[:-1]] # 1st item of each group is a new unique observation - mask = isnull(val) + mask = _isnull(val) if dropna: inc[idx] = 1 inc[mask] = 0 @@ -2980,8 +3035,8 @@ def value_counts(self, normalize=False, sort=True, ascending=False, mi = MultiIndex(levels=levels, labels=labels, names=names, verify_integrity=False) - if com.is_integer_dtype(out): - out = com._ensure_int64(out) + if is_integer_dtype(out): + out = _ensure_int64(out) return Series(out, index=mi, name=self.name) # for compat. 
with algos.value_counts need to ensure every @@ -3011,8 +3066,8 @@ def value_counts(self, normalize=False, sort=True, ascending=False, mi = MultiIndex(levels=levels, labels=labels, names=names, verify_integrity=False) - if com.is_integer_dtype(out): - out = com._ensure_int64(out) + if is_integer_dtype(out): + out = _ensure_int64(out) return Series(out, index=mi, name=self.name) def count(self): @@ -3021,7 +3076,7 @@ def count(self): val = self.obj.get_values() mask = (ids != -1) & ~isnull(val) - ids = com._ensure_platform_int(ids) + ids = _ensure_platform_int(ids) out = np.bincount(ids[mask], minlength=ngroups or None) return Series(out, @@ -3385,11 +3440,14 @@ def first_non_None_value(values): return self._reindex_output(result) + # values are not series or array-like but scalars else: # only coerce dates if we find at least 1 datetime coerce = True if any([isinstance(x, Timestamp) for x in values]) else False - return (Series(values, index=key_index, name=self.name) + # self.name not passed through to Series as the result + # should not take the name of original selection of columns + return (Series(values, index=key_index) ._convert(datetime=True, coerce=coerce)) @@ -3595,7 +3653,7 @@ def filter(self, func, dropna=True, *args, **kwargs): # noqa pass # interpret the result of the filter - if is_bool(res) or (lib.isscalar(res) and isnull(res)): + if is_bool(res) or (is_scalar(res) and isnull(res)): if res and notnull(res): indices.append(self._get_index(name)) else: @@ -3740,9 +3798,39 @@ def _reindex_output(self, result): return result levels_list = [ping.group_index for ping in groupings] - index = MultiIndex.from_product(levels_list, names=self.grouper.names) - d = {self.obj._get_axis_name(self.axis): index, 'copy': False} - return result.reindex(**d).sortlevel(axis=self.axis) + index, _ = MultiIndex.from_product( + levels_list, names=self.grouper.names).sortlevel() + + if self.as_index: + d = {self.obj._get_axis_name(self.axis): index, 'copy': False} + return result.reindex(**d) + + # GH 13204 + # Here, the categorical in-axis groupers, which need to be fully + # expanded, are columns in `result`. An idea is to do: + # result = result.set_index(self.grouper.names) + # .reindex(index).reset_index() + # but special care has to be taken because of possible not-in-axis + # groupers. + # So, we manually select and drop the in-axis grouper columns, + # reindex `result`, and then reset the in-axis grouper columns. 
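A small sketch of the behaviour this comment is after (GH 13204): with a categorical in-axis grouper and ``as_index=False``, unobserved categories should still appear in the reindexed result. Column names and values here are made up for illustration:

import pandas as pd

cat = pd.Categorical(['a', 'a', 'b'], categories=['a', 'b', 'c'])
df = pd.DataFrame({'key': cat, 'val': [1, 2, 3]})

# 'c' has no rows, but the fully expanded output is expected to include it
res = df.groupby('key', as_index=False).sum()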
+ + # Select in-axis groupers + in_axis_grps = [(i, ping.name) for (i, ping) + in enumerate(groupings) if ping.in_axis] + g_nums, g_names = zip(*in_axis_grps) + + result = result.drop(labels=list(g_names), axis=1) + + # Set a temp index and reindex (possibly expanding) + result = result.set_index(self.grouper.result_index + ).reindex(index, copy=False) + + # Reset in-axis grouper columns + # (using level numbers `g_nums` because level names may not be unique) + result = result.reset_index(level=g_nums) + + return result.reset_index(drop=True) def _iterate_column_groupbys(self): for i, colname in enumerate(self._selected_obj.columns): @@ -3762,7 +3850,7 @@ def count(self): """ Compute count of group, excluding missing values """ from functools import partial from pandas.lib import count_level_2d - from pandas.core.common import _isnull_ndarraylike as isnull + from pandas.types.missing import _isnull_ndarraylike as isnull data, _ = self._get_data_to_aggregate() ids, _, ngroups = self.grouper.group_info @@ -3883,7 +3971,7 @@ class DataSplitter(object): def __init__(self, data, labels, ngroups, axis=0): self.data = data - self.labels = com._ensure_int64(labels) + self.labels = _ensure_int64(labels) self.ngroups = ngroups self.axis = axis @@ -4040,7 +4128,10 @@ def loop(labels, shape): out = stride * labels[0].astype('i8', subok=False, copy=False) for i in range(1, nlev): - stride //= shape[i] + if shape[i] == 0: + stride = 0 + else: + stride //= shape[i] out += labels[i] * stride if xnull: # exclude nulls @@ -4064,7 +4155,7 @@ def loop(labels, shape): def maybe_lift(lab, size): # pormote nan values return (lab + 1, size + 1) if (lab == -1).any() else (lab, size) - labels = map(com._ensure_int64, labels) + labels = map(_ensure_int64, labels) if not xnull: labels, shape = map(list, zip(*map(maybe_lift, labels, shape))) @@ -4279,10 +4370,12 @@ def _get_group_index_sorter(group_index, ngroups): count = len(group_index) alpha = 0.0 # taking complexities literally; there may be beta = 1.0 # some room for fine-tuning these parameters - if alpha + beta * ngroups < count * np.log(count): - sorter, _ = _algos.groupsort_indexer(com._ensure_int64(group_index), + do_groupsort = (count > 0 and ((alpha + beta * ngroups) < + (count * np.log(count)))) + if do_groupsort: + sorter, _ = _algos.groupsort_indexer(_ensure_int64(group_index), ngroups) - return com._ensure_platform_int(sorter) + return _ensure_platform_int(sorter) else: return group_index.argsort(kind='mergesort') @@ -4297,7 +4390,7 @@ def _compress_group_index(group_index, sort=True): size_hint = min(len(group_index), _hash._SIZE_HINT_LIMIT) table = _hash.Int64HashTable(size_hint) - group_index = com._ensure_int64(group_index) + group_index = _ensure_int64(group_index) # note, group labels come out ascending (ie, 1,2,3 etc) comp_ids, obs_group_ids = table.get_labels_groupby(group_index) @@ -4329,8 +4422,19 @@ def _reorder_by_uniques(uniques, labels): def _groupby_indices(values): - return _algos.groupby_indices(_values_from_object( - com._ensure_object(values))) + + if is_categorical_dtype(values): + + # we have a categorical, so we can do quite a bit + # bit better than factorizing again + reverse = dict(enumerate(values.categories)) + codes = values.codes.astype('int64') + _, counts = _hash.value_count_int64(codes, False) + else: + reverse, codes, counts = _algos.group_labels( + _values_from_object(_ensure_object(values))) + + return _algos.groupby_indices(reverse, codes, counts) def numpy_groupby(data, labels, axis=0): diff --git 
a/pandas/core/indexing.py b/pandas/core/indexing.py index 9485f50ed07f1..a7cc3b9dddd36 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1,17 +1,25 @@ # pylint: disable=W0223 -from pandas.core.index import Index, MultiIndex +import numpy as np from pandas.compat import range, zip import pandas.compat as compat +from pandas.types.generic import ABCDataFrame, ABCPanel, ABCSeries +from pandas.types.common import (is_integer_dtype, + is_integer, is_float, + is_categorical_dtype, + is_list_like, + is_sequence, + is_scalar, + is_sparse, + _ensure_platform_int) +from pandas.types.missing import isnull, _infer_fill_value + +from pandas.core.index import Index, MultiIndex + import pandas.core.common as com -import pandas.lib as lib -from pandas.core.common import (is_bool_indexer, is_integer_dtype, - _asarray_tuplesafe, is_list_like, isnull, - is_null_slice, is_full_slice, ABCSeries, - ABCDataFrame, ABCPanel, is_float, - _values_from_object, _infer_fill_value, - is_integer) -import numpy as np +from pandas.core.common import (is_bool_indexer, _asarray_tuplesafe, + is_null_slice, is_full_slice, + _values_from_object) # the supported indexers @@ -67,7 +75,7 @@ def __getitem__(self, key): key = tuple(com._apply_if_callable(x, self.obj) for x in key) try: values = self.obj.get_value(*key) - if lib.isscalar(values): + if is_scalar(values): return values except Exception: pass @@ -625,7 +633,7 @@ def _align_series(self, indexer, ser, multiindex_indexer=False): # we have a frame, with multiple indexers on both axes; and a # series, so need to broadcast (see GH5206) if (sum_aligners == self.ndim and - all([com.is_sequence(_) for _ in indexer])): + all([is_sequence(_) for _ in indexer])): ser = ser.reindex(obj.axes[0][indexer[0]], copy=True)._values # single indexer @@ -639,7 +647,7 @@ def _align_series(self, indexer, ser, multiindex_indexer=False): ax = obj.axes[i] # multiple aligners (or null slices) - if com.is_sequence(idx) or isinstance(idx, slice): + if is_sequence(idx) or isinstance(idx, slice): if single_aligner and is_null_slice(idx): continue new_ix = ax[idx] @@ -685,7 +693,7 @@ def _align_series(self, indexer, ser, multiindex_indexer=False): return ser - elif lib.isscalar(indexer): + elif is_scalar(indexer): ax = self.obj._get_axis(1) if ser.index.equals(ax): @@ -710,7 +718,7 @@ def _align_frame(self, indexer, df): sindexers = [] for i, ix in enumerate(indexer): ax = self.obj.axes[i] - if com.is_sequence(ix) or isinstance(ix, slice): + if is_sequence(ix) or isinstance(ix, slice): if idx is None: idx = ax[ix].ravel() elif cols is None: @@ -761,7 +769,7 @@ def _align_frame(self, indexer, df): val = df.reindex(index=ax)._values return val - elif lib.isscalar(indexer) and is_panel: + elif is_scalar(indexer) and is_panel: idx = self.obj.axes[1] cols = self.obj.axes[2] @@ -857,7 +865,7 @@ def _convert_for_reindex(self, key, axis=0): keyarr = _asarray_tuplesafe(key) if is_integer_dtype(keyarr) and not labels.is_integer(): - keyarr = com._ensure_platform_int(keyarr) + keyarr = _ensure_platform_int(keyarr) return labels.take(keyarr) return keyarr @@ -968,7 +976,7 @@ def _getitem_nested_tuple(self, tup): axis += 1 # if we have a scalar, we are done - if lib.isscalar(obj) or not hasattr(obj, 'ndim'): + if is_scalar(obj) or not hasattr(obj, 'ndim'): break # has the dim of the obj changed? 
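Most of the indexing.py churn above is mechanical: type-check helpers move from ``pandas.core.common``/``pandas.lib`` to ``pandas.types.common`` and ``pandas.types.missing``. A rough sketch of the new-style usage, with names taken from the imports in this diff:

from pandas.types.common import is_scalar, is_sequence
from pandas.types.missing import isnull

is_scalar(3)            # True; replaces lib.isscalar
is_sequence([1, 2, 3])  # True; replaces com.is_sequence
isnull(float('nan'))    # True; replaces com.isnull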
@@ -1038,7 +1046,7 @@ def _getitem_iterable(self, key, axis=0): # asarray can be unsafe, NumPy strings are weird keyarr = _asarray_tuplesafe(key) - if com.is_categorical_dtype(labels): + if is_categorical_dtype(labels): keyarr = labels._shallow_copy(keyarr) # have the index handle the indexer and possibly return @@ -1210,7 +1218,9 @@ def _convert_to_indexer(self, obj, axis=0, is_setter=False): else: (indexer, missing) = labels.get_indexer_non_unique(objarr) - check = indexer + # 'indexer' has dupes, create 'check' using 'missing' + check = np.zeros_like(objarr) + check[missing] = -1 mask = check == -1 if mask.any(): @@ -1799,12 +1809,13 @@ def check_bool_indexer(ax, key): result = key if isinstance(key, ABCSeries) and not key.index.equals(ax): result = result.reindex(ax) - mask = com.isnull(result._values) + mask = isnull(result._values) if mask.any(): raise IndexingError('Unalignable boolean Series key provided') - result = result.astype(bool)._values - + elif is_sparse(result): + result = result.to_dense() + result = np.asarray(result, dtype=bool) else: # is_bool_indexer has already checked for nulls in the case of an # object array key, so no check needed here @@ -1941,9 +1952,9 @@ def _non_reducing_slice(slice_): def pred(part): # true when slice does *not* reduce - return isinstance(part, slice) or com.is_list_like(part) + return isinstance(part, slice) or is_list_like(part) - if not com.is_list_like(slice_): + if not is_list_like(slice_): if not isinstance(slice_, slice): # a 1-d slice, like df.loc[1] slice_ = [[slice_]] diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 97df81ad6be48..da72309b8eae1 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -10,29 +10,49 @@ from pandas.core.base import PandasObject -from pandas.core.common import (_possibly_downcast_to_dtype, isnull, _NS_DTYPE, - _TD_DTYPE, ABCSeries, is_list_like, - _infer_dtype_from_scalar, is_null_slice, - is_dtype_equal, is_null_datelike_scalar, - _maybe_promote, is_timedelta64_dtype, - is_datetime64_dtype, is_datetimetz, is_sparse, - array_equivalent, _is_na_compat, - _maybe_convert_string_to_object, - _maybe_convert_scalar, - is_categorical, is_datetimelike_v_numeric, - is_numeric_v_string_like, is_extension_type) +from pandas.types.dtypes import DatetimeTZDtype, CategoricalDtype +from pandas.types.common import (_TD_DTYPE, _NS_DTYPE, + _ensure_int64, _ensure_platform_int, + is_integer, + is_dtype_equal, + is_timedelta64_dtype, + is_datetime64_dtype, is_datetimetz, is_sparse, + is_categorical, is_categorical_dtype, + is_integer_dtype, + is_datetime64tz_dtype, + is_object_dtype, + is_datetimelike_v_numeric, + is_numeric_v_string_like, is_extension_type, + is_list_like, + is_re, + is_re_compilable, + is_scalar, + _get_dtype) +from pandas.types.cast import (_possibly_downcast_to_dtype, + _maybe_convert_string_to_object, + _maybe_upcast, + _maybe_convert_scalar, _maybe_promote, + _infer_dtype_from_scalar, + _soft_convert_objects, + _possibly_convert_objects, + _astype_nansafe, + _find_common_type) +from pandas.types.missing import (isnull, array_equivalent, + _is_na_compat, + is_null_datelike_scalar) +import pandas.types.concat as _concat + +from pandas.types.generic import ABCSeries +from pandas.core.common import is_null_slice import pandas.core.algorithms as algos -from pandas.types.api import DatetimeTZDtype from pandas.core.index import Index, MultiIndex, _ensure_index from pandas.core.indexing import maybe_convert_indices, length_of_indexer from pandas.core.categorical import 
Categorical, maybe_to_categorical from pandas.tseries.index import DatetimeIndex from pandas.formats.printing import pprint_thing -import pandas.core.common as com -import pandas.types.concat as _concat + import pandas.core.missing as missing -import pandas.core.convert as convert from pandas.sparse.array import _maybe_to_sparse, SparseArray import pandas.lib as lib import pandas.tslib as tslib @@ -112,8 +132,8 @@ def is_categorical_astype(self, dtype): validate that we have a astypeable to categorical, returns a boolean if we are a categorical """ - if com.is_categorical_dtype(dtype): - if dtype == com.CategoricalDtype(): + if is_categorical_dtype(dtype): + if dtype == CategoricalDtype(): return True # this is a pd.Categorical, but is not @@ -137,7 +157,7 @@ def get_values(self, dtype=None): return an internal format, currently just the ndarray this is often overriden to handle to_dense like operations """ - if com.is_object_dtype(dtype): + if is_object_dtype(dtype): return self.values.astype(object) return self.values @@ -328,7 +348,8 @@ def apply(self, func, mgr=None, **kwargs): """ apply the function to my values; return a block if we are not one """ - result = func(self.values, **kwargs) + with np.errstate(all='ignore'): + result = func(self.values, **kwargs) if not isinstance(result, Block): result = self.make_block(values=_block_shape(result, ndim=self.ndim)) @@ -481,7 +502,7 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None, values = self.get_values(dtype=dtype) # _astype_nansafe works fine with 1-d only - values = com._astype_nansafe(values.ravel(), dtype, copy=True) + values = _astype_nansafe(values.ravel(), dtype, copy=True) values = values.reshape(self.shape) newb = make_block(values, placement=self.mgr_locs, dtype=dtype, @@ -651,7 +672,7 @@ def setitem(self, indexer, value, mgr=None): # cast the values to a type that can hold nan (if necessary) if not self._can_hold_element(value): - dtype, _ = com._maybe_promote(arr_value.dtype) + dtype, _ = _maybe_promote(arr_value.dtype) values = values.astype(dtype) transf = (lambda x: x.T) if self.ndim == 2 else (lambda x: x) @@ -684,7 +705,7 @@ def _is_scalar_indexer(indexer): if arr_value.ndim == 1: if not isinstance(indexer, tuple): indexer = tuple([indexer]) - return all([lib.isscalar(idx) for idx in indexer]) + return all([is_scalar(idx) for idx in indexer]) return False def _is_empty_indexer(indexer): @@ -724,7 +745,7 @@ def _is_empty_indexer(indexer): if hasattr(value, 'dtype') and is_dtype_equal(values.dtype, value.dtype): dtype = value.dtype - elif lib.isscalar(value): + elif is_scalar(value): dtype, _ = _infer_dtype_from_scalar(value) else: dtype = 'infer' @@ -838,7 +859,7 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, n = np.array(new) # type of the new block - dtype, _ = com._maybe_promote(n.dtype) + dtype, _ = _maybe_promote(n.dtype) # we need to explicitly astype here to make a copy n = n.astype(dtype) @@ -1027,7 +1048,7 @@ def shift(self, periods, axis=0, mgr=None): # convert integer to float if necessary. 
need to do a lot more than # that, handle boolean etc also - new_values, fill_value = com._maybe_upcast(self.values) + new_values, fill_value = _maybe_upcast(self.values) # make sure array sent to np.roll is c_contiguous f_ordered = new_values.flags.f_contiguous @@ -1036,7 +1057,7 @@ def shift(self, periods, axis=0, mgr=None): axis = new_values.ndim - axis - 1 if np.prod(new_values.shape): - new_values = np.roll(new_values, com._ensure_platform_int(periods), + new_values = np.roll(new_values, _ensure_platform_int(periods), axis=axis) axis_indexer = [slice(None)] * self.ndim @@ -1136,7 +1157,8 @@ def handle_error(): # get the result try: - result = get_result(other) + with np.errstate(all='ignore'): + result = get_result(other) # if we have an invalid shape/broadcast error # GH4576, so raise instead of allowing to pass through @@ -1306,7 +1328,7 @@ def quantile(self, qs, interpolation='linear', axis=0, mgr=None): from pandas import Float64Index is_empty = values.shape[axis] == 0 - if com.is_list_like(qs): + if is_list_like(qs): ax = Float64Index(qs) if is_empty: @@ -1350,7 +1372,7 @@ def quantile(self, qs, interpolation='linear', axis=0, mgr=None): ndim = getattr(result, 'ndim', None) or 0 result = self._try_coerce_result(result) - if lib.isscalar(result): + if is_scalar(result): return ax, self.make_block_scalar(result) return ax, make_block(result, placement=np.arange(len(result)), @@ -1471,7 +1493,7 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, if isinstance(new, np.ndarray) and len(new) == len(mask): new = new[mask] - mask = mask.reshape(new_values.shape) + mask = _safe_reshape(mask, new_values.shape) new_values[mask] = new new_values = self._try_coerce_result(new_values) return [self.make_block(values=new_values)] @@ -1529,6 +1551,20 @@ def to_native_types(self, slicer=None, na_rep='', float_format=None, if slicer is not None: values = values[:, slicer] + # see gh-13418: no special formatting is desired at the + # output (important for appropriate 'quoting' behaviour), + # so do not pass it through the FloatArrayFormatter + if float_format is None and decimal == '.': + mask = isnull(values) + + if not quoting: + values = values.astype(str) + else: + values = np.array(values, dtype='object') + + values[mask] = na_rep + return values + from pandas.formats.format import FloatArrayFormatter formatter = FloatArrayFormatter(values, na_rep=na_rep, float_format=float_format, @@ -1577,7 +1613,7 @@ def _can_hold_element(self, element): tipo = element.dtype.type return (issubclass(tipo, np.integer) and not issubclass(tipo, (np.datetime64, np.timedelta64))) - return com.is_integer(element) + return is_integer(element) def _try_cast(self, element): try: @@ -1586,7 +1622,7 @@ def _try_cast(self, element): return element def should_store(self, value): - return com.is_integer_dtype(value) and value.dtype == self.dtype + return is_integer_dtype(value) and value.dtype == self.dtype class DatetimeLikeBlockMixin(object): @@ -1607,7 +1643,7 @@ def get_values(self, dtype=None): """ return object dtype as boxed values, such as Timestamps/Timedelta """ - if com.is_object_dtype(dtype): + if is_object_dtype(dtype): return lib.map_infer(self.values.ravel(), self._box_func).reshape(self.values.shape) return self.values @@ -1627,7 +1663,7 @@ def fillna(self, value, **kwargs): # allow filling with integers to be # interpreted as seconds - if not isinstance(value, np.timedelta64) and com.is_integer(value): + if not isinstance(value, np.timedelta64) and is_integer(value): value = Timedelta(value, 
unit='s') return super(TimeDeltaBlock, self).fillna(value, **kwargs) @@ -1660,7 +1696,7 @@ def _try_coerce_args(self, values, other): other = other.value elif isinstance(other, np.timedelta64): other_mask = isnull(other) - other = other.view('i8') + other = Timedelta(other).value elif isinstance(other, timedelta): other = Timedelta(other).value elif isinstance(other, np.ndarray): @@ -1781,10 +1817,10 @@ def convert(self, *args, **kwargs): new_style |= kw in kwargs if new_style: - fn = convert._soft_convert_objects + fn = _soft_convert_objects fn_inputs = new_inputs else: - fn = convert._possibly_convert_objects + fn = _possibly_convert_objects fn_inputs = ['convert_dates', 'convert_numeric', 'convert_timedeltas'] fn_inputs += ['copy'] @@ -1806,7 +1842,7 @@ def convert(self, *args, **kwargs): try: values = values.reshape(shape) values = _block_shape(values, ndim=self.ndim) - except AttributeError: + except (AttributeError, NotImplementedError): pass newb = make_block(values, ndim=self.ndim, placement=[rl]) blocks.append(newb) @@ -1870,15 +1906,15 @@ def should_store(self, value): def replace(self, to_replace, value, inplace=False, filter=None, regex=False, convert=True, mgr=None): - to_rep_is_list = com.is_list_like(to_replace) - value_is_list = com.is_list_like(value) + to_rep_is_list = is_list_like(to_replace) + value_is_list = is_list_like(value) both_lists = to_rep_is_list and value_is_list either_list = to_rep_is_list or value_is_list result_blocks = [] blocks = [self] - if not either_list and com.is_re(to_replace): + if not either_list and is_re(to_replace): return self._replace_single(to_replace, value, inplace=inplace, filter=filter, regex=True, convert=convert, mgr=mgr) @@ -1916,10 +1952,10 @@ def replace(self, to_replace, value, inplace=False, filter=None, def _replace_single(self, to_replace, value, inplace=False, filter=None, regex=False, convert=True, mgr=None): # to_replace is regex compilable - to_rep_re = regex and com.is_re_compilable(to_replace) + to_rep_re = regex and is_re_compilable(to_replace) # regex is regex compilable - regex_re = com.is_re_compilable(regex) + regex_re = is_re_compilable(regex) # only one will survive if to_rep_re and regex_re: @@ -2032,7 +2068,7 @@ def _try_coerce_result(self, result): # GH12564: CategoricalBlock is 1-dim only # while returned results could be any dim - if ((not com.is_categorical_dtype(result)) and + if ((not is_categorical_dtype(result)) and isinstance(result, np.ndarray)): result = _block_shape(result, ndim=self.ndim) @@ -2137,7 +2173,7 @@ def _astype(self, dtype, mgr=None, **kwargs): """ # if we are passed a datetime64[ns, tz] - if com.is_datetime64tz_dtype(dtype): + if is_datetime64tz_dtype(dtype): dtype = DatetimeTZDtype(dtype) values = self.values @@ -2153,7 +2189,7 @@ def _can_hold_element(self, element): if is_list_like(element): element = np.array(element) return element.dtype == _NS_DTYPE or element.dtype == np.int64 - return (com.is_integer(element) or isinstance(element, datetime) or + return (is_integer(element) or isinstance(element, datetime) or isnull(element)) def _try_cast(self, element): @@ -2195,7 +2231,7 @@ def _try_coerce_args(self, values, other): "naive Block") other_mask = isnull(other) other = other.asm8.view('i8') - elif hasattr(other, 'dtype') and com.is_integer_dtype(other): + elif hasattr(other, 'dtype') and is_integer_dtype(other): other = other.view('i8') else: try: @@ -2301,7 +2337,7 @@ def external_values(self): def get_values(self, dtype=None): # return object dtype as Timestamps with the zones - 
if com.is_object_dtype(dtype): + if is_object_dtype(dtype): f = lambda x: lib.Timestamp(x, tz=self.values.tz) return lib.map_infer( self.values.ravel(), f).reshape(self.values.shape) @@ -2405,15 +2441,14 @@ def shift(self, periods, axis=0, mgr=None): else: indexer[:periods] = np.arange(-periods, N) - # move to UTC & take - new_values = self.values.tz_localize(None).asi8.take(indexer) + new_values = self.values.asi8.take(indexer) if periods > 0: new_values[:periods] = tslib.iNaT else: new_values[periods:] = tslib.iNaT - new_values = DatetimeIndex(new_values, tz=self.values.tz) + new_values = self.values._shallow_copy(new_values) return [self.make_block_same_class(new_values, placement=self.mgr_locs)] @@ -2443,9 +2478,6 @@ def fill_value(self): @fill_value.setter def fill_value(self, v): - # we may need to upcast our fill to match our dtype - if issubclass(self.dtype.type, np.floating): - v = float(v) self.values.fill_value = v def to_dense(self): @@ -2471,6 +2503,14 @@ def sp_index(self): def kind(self): return self.values.kind + def _astype(self, dtype, copy=False, raise_on_error=True, values=None, + klass=None, mgr=None, **kwargs): + if values is None: + values = self.values + values = values.astype(dtype, copy=copy) + return self.make_block_same_class(values=values, + placement=self.mgr_locs) + def __len__(self): try: return self.sp_index.length @@ -2488,7 +2528,7 @@ def make_block_same_class(self, values, placement, sparse_index=None, copy=False, fastpath=True, **kwargs): """ return a new block """ if dtype is None: - dtype = self.dtype + dtype = values.dtype if fill_value is None and not isinstance(values, SparseArray): fill_value = self.values.fill_value @@ -2547,7 +2587,7 @@ def shift(self, periods, axis=0, mgr=None): new_values = self.values.to_dense().take(indexer) # convert integer to float if necessary. 
need to do a lot more than # that, handle boolean etc also - new_values, fill_value = com._maybe_upcast(new_values) + new_values, fill_value = _maybe_upcast(new_values) if periods > 0: new_values[:periods] = fill_value else: @@ -3071,7 +3111,7 @@ def reduction(self, f, axis=0, consolidate=True, transposed=False, # compute the orderings of our original data if len(self.blocks) > 1: - indexer = np.empty(len(self.axes[0]), dtype='int64') + indexer = np.empty(len(self.axes[0]), dtype=np.intp) i = 0 for b in self.blocks: for j in b.mgr_locs: @@ -3477,7 +3517,7 @@ def get(self, item, fastpath=True): indexer = np.arange(len(self.items))[isnull(self.items)] # allow a single nan location indexer - if not lib.isscalar(indexer): + if not is_scalar(indexer): if len(indexer) == 1: loc = indexer.item() else: @@ -3583,7 +3623,7 @@ def value_getitem(placement): return value else: if value.ndim == self.ndim - 1: - value = value.reshape((1,) + value.shape) + value = _safe_reshape(value, (1,) + value.shape) def value_getitem(placement): return value @@ -3809,7 +3849,7 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None): return [blk.getitem_block(slobj, new_mgr_locs=slice(0, sllen))] elif not allow_fill or self.ndim == 1: if allow_fill and fill_tuple[0] is None: - _, fill_value = com._maybe_promote(blk.dtype) + _, fill_value = _maybe_promote(blk.dtype) fill_tuple = (fill_value, ) return [blk.take_nd(slobj, axis=0, @@ -3867,7 +3907,7 @@ def _make_na_block(self, placement, fill_value=None): block_shape = list(self.shape) block_shape[0] = len(placement) - dtype, fill_value = com._infer_dtype_from_scalar(fill_value) + dtype, fill_value = _infer_dtype_from_scalar(fill_value) block_values = np.empty(block_shape, dtype=dtype) block_values.fill(fill_value) return make_block(block_values, placement=placement) @@ -4395,14 +4435,6 @@ def _interleaved_dtype(blocks): for x in blocks: counts[type(x)].append(x) - def _lcd_dtype(l): - """ find the lowest dtype that can accomodate the given types """ - m = l[0].dtype - for x in l[1:]: - if x.dtype.itemsize > m.itemsize: - m = x.dtype - return m - have_int = len(counts[IntBlock]) > 0 have_bool = len(counts[BoolBlock]) > 0 have_object = len(counts[ObjectBlock]) > 0 @@ -4415,7 +4447,6 @@ def _lcd_dtype(l): # TODO: have_sparse is not used have_sparse = len(counts[SparseBlock]) > 0 # noqa have_numeric = have_float or have_complex or have_int - has_non_numeric = have_dt64 or have_dt64_tz or have_td64 or have_cat if (have_object or @@ -4427,10 +4458,9 @@ def _lcd_dtype(l): elif have_bool: return np.dtype(bool) elif have_int and not have_float and not have_complex: - # if we are mixing unsigned and signed, then return # the next biggest int type (if we can) - lcd = _lcd_dtype(counts[IntBlock]) + lcd = _find_common_type([b.dtype for b in counts[IntBlock]]) kinds = set([i.dtype.kind for i in counts[IntBlock]]) if len(kinds) == 1: return lcd @@ -4446,7 +4476,8 @@ def _lcd_dtype(l): elif have_complex: return np.dtype('c16') else: - return _lcd_dtype(counts[FloatBlock] + counts[SparseBlock]) + introspection_blks = counts[FloatBlock] + counts[SparseBlock] + return _find_common_type([b.dtype for b in introspection_blks]) def _consolidate(blocks): @@ -4546,7 +4577,7 @@ def _possibly_compare(a, b, op): else: result = op(a, b) - if lib.isscalar(result) and (is_a_array or is_b_array): + if is_scalar(result) and (is_a_array or is_b_array): type_names = [type(a).__name__, type(b).__name__] if is_a_array: @@ -4597,7 +4628,7 @@ def _factor_indexer(shape, labels): expanded label 
indexer """ mult = np.array(shape)[::-1].cumprod()[::-1] - return com._ensure_platform_int( + return _ensure_platform_int( np.sum(np.array(labels).T * np.append(mult, [1]), axis=1).T) @@ -4617,7 +4648,7 @@ def _get_blkno_placements(blknos, blk_count, group=True): """ - blknos = com._ensure_int64(blknos) + blknos = _ensure_int64(blknos) # FIXME: blk_count is unused, but it may avoid the use of dicts in cython for blkno, indexer in lib.get_blkno_indexers(blknos, group): @@ -4653,6 +4684,28 @@ def rrenamer(x): _transform_index(right, rrenamer)) +def _safe_reshape(arr, new_shape): + """ + If possible, reshape `arr` to have shape `new_shape`, + with a couple of exceptions (see gh-13012): + + 1) If `arr` is a Categorical or Index, `arr` will be + returned as is. + 2) If `arr` is a Series, the `_values` attribute will + be reshaped and returned. + + Parameters + ---------- + arr : array-like, object to be reshaped + new_shape : int or tuple of ints, the new shape + """ + if isinstance(arr, ABCSeries): + arr = arr._values + if not isinstance(arr, Categorical): + arr = arr.reshape(new_shape) + return arr + + def _transform_index(index, func): """ Apply function to all values found in index. @@ -4707,7 +4760,7 @@ def _putmask_smart(v, m, n): pass # change the dtype - dtype, _ = com._maybe_promote(n.dtype) + dtype, _ = _maybe_promote(n.dtype) nv = v.astype(dtype) try: nv[m] = n[m] @@ -4734,10 +4787,9 @@ def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy): [get_mgr_concatenation_plan(mgr, indexers) for mgr, indexers in mgrs_indexers], concat_axis) - blocks = [make_block(concatenate_join_units(join_units, concat_axis, - copy=copy), - placement=placement) - for placement, join_units in concat_plan] + blocks = [make_block( + concatenate_join_units(join_units, concat_axis, copy=copy), + placement=placement) for placement, join_units in concat_plan] return BlockManager(blocks, axes) @@ -4773,9 +4825,9 @@ def get_empty_dtype_and_na(join_units): if dtype is None: continue - if com.is_categorical_dtype(dtype): + if is_categorical_dtype(dtype): upcast_cls = 'category' - elif com.is_datetimetz(dtype): + elif is_datetimetz(dtype): upcast_cls = 'datetimetz' elif issubclass(dtype.type, np.bool_): upcast_cls = 'bool' @@ -5048,8 +5100,8 @@ def dtype(self): if not self.needs_filling: return self.block.dtype else: - return com._get_dtype(com._maybe_promote(self.block.dtype, - self.block.fill_value)[0]) + return _get_dtype(_maybe_promote(self.block.dtype, + self.block.fill_value)[0]) return self._dtype diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 911fcaf529f98..b847415f274db 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -5,10 +5,15 @@ import numpy as np from distutils.version import LooseVersion -import pandas.core.common as com import pandas.algos as algos import pandas.lib as lib from pandas.compat import range, string_types +from pandas.types.common import (is_numeric_v_string_like, + is_float_dtype, is_datetime64_dtype, + is_integer_dtype, _ensure_float64, + is_scalar, + _DATELIKE_DTYPES) +from pandas.types.missing import isnull def mask_missing(arr, values_to_mask): @@ -24,7 +29,7 @@ def mask_missing(arr, values_to_mask): except Exception: values_to_mask = np.array(values_to_mask, dtype=object) - na_mask = com.isnull(values_to_mask) + na_mask = isnull(values_to_mask) nonna = values_to_mask[~na_mask] mask = None @@ -32,28 +37,28 @@ def mask_missing(arr, values_to_mask): if mask is None: # numpy elementwise comparison warning - if 
com.is_numeric_v_string_like(arr, x): + if is_numeric_v_string_like(arr, x): mask = False else: mask = arr == x # if x is a string and arr is not, then we get False and we must # expand the mask to size arr.shape - if lib.isscalar(mask): + if is_scalar(mask): mask = np.zeros(arr.shape, dtype=bool) else: # numpy elementwise comparison warning - if com.is_numeric_v_string_like(arr, x): + if is_numeric_v_string_like(arr, x): mask |= False else: mask |= arr == x if na_mask.any(): if mask is None: - mask = com.isnull(arr) + mask = isnull(arr) else: - mask |= com.isnull(arr) + mask |= isnull(arr) return mask @@ -110,7 +115,7 @@ def interpolate_1d(xvalues, yvalues, method='linear', limit=None, """ # Treat the original, non-scipy methods first. - invalid = com.isnull(yvalues) + invalid = isnull(yvalues) valid = ~invalid if not valid.any(): @@ -442,12 +447,12 @@ def pad_1d(values, limit=None, mask=None, dtype=None): if dtype is None: dtype = values.dtype _method = None - if com.is_float_dtype(values): + if is_float_dtype(values): _method = getattr(algos, 'pad_inplace_%s' % dtype.name, None) - elif dtype in com._DATELIKE_DTYPES or com.is_datetime64_dtype(values): + elif dtype in _DATELIKE_DTYPES or is_datetime64_dtype(values): _method = _pad_1d_datetime - elif com.is_integer_dtype(values): - values = com._ensure_float64(values) + elif is_integer_dtype(values): + values = _ensure_float64(values) _method = algos.pad_inplace_float64 elif values.dtype == np.object_: _method = algos.pad_inplace_object @@ -456,7 +461,7 @@ def pad_1d(values, limit=None, mask=None, dtype=None): raise ValueError('Invalid dtype for pad_1d [%s]' % dtype.name) if mask is None: - mask = com.isnull(values) + mask = isnull(values) mask = mask.view(np.uint8) _method(values, mask, limit=limit) return values @@ -467,12 +472,12 @@ def backfill_1d(values, limit=None, mask=None, dtype=None): if dtype is None: dtype = values.dtype _method = None - if com.is_float_dtype(values): + if is_float_dtype(values): _method = getattr(algos, 'backfill_inplace_%s' % dtype.name, None) - elif dtype in com._DATELIKE_DTYPES or com.is_datetime64_dtype(values): + elif dtype in _DATELIKE_DTYPES or is_datetime64_dtype(values): _method = _backfill_1d_datetime - elif com.is_integer_dtype(values): - values = com._ensure_float64(values) + elif is_integer_dtype(values): + values = _ensure_float64(values) _method = algos.backfill_inplace_float64 elif values.dtype == np.object_: _method = algos.backfill_inplace_object @@ -481,7 +486,7 @@ def backfill_1d(values, limit=None, mask=None, dtype=None): raise ValueError('Invalid dtype for backfill_1d [%s]' % dtype.name) if mask is None: - mask = com.isnull(values) + mask = isnull(values) mask = mask.view(np.uint8) _method(values, mask, limit=limit) @@ -493,12 +498,12 @@ def pad_2d(values, limit=None, mask=None, dtype=None): if dtype is None: dtype = values.dtype _method = None - if com.is_float_dtype(values): + if is_float_dtype(values): _method = getattr(algos, 'pad_2d_inplace_%s' % dtype.name, None) - elif dtype in com._DATELIKE_DTYPES or com.is_datetime64_dtype(values): + elif dtype in _DATELIKE_DTYPES or is_datetime64_dtype(values): _method = _pad_2d_datetime - elif com.is_integer_dtype(values): - values = com._ensure_float64(values) + elif is_integer_dtype(values): + values = _ensure_float64(values) _method = algos.pad_2d_inplace_float64 elif values.dtype == np.object_: _method = algos.pad_2d_inplace_object @@ -507,7 +512,7 @@ def pad_2d(values, limit=None, mask=None, dtype=None): raise ValueError('Invalid dtype for 
pad_2d [%s]' % dtype.name) if mask is None: - mask = com.isnull(values) + mask = isnull(values) mask = mask.view(np.uint8) if np.all(values.shape): @@ -523,12 +528,12 @@ def backfill_2d(values, limit=None, mask=None, dtype=None): if dtype is None: dtype = values.dtype _method = None - if com.is_float_dtype(values): + if is_float_dtype(values): _method = getattr(algos, 'backfill_2d_inplace_%s' % dtype.name, None) - elif dtype in com._DATELIKE_DTYPES or com.is_datetime64_dtype(values): + elif dtype in _DATELIKE_DTYPES or is_datetime64_dtype(values): _method = _backfill_2d_datetime - elif com.is_integer_dtype(values): - values = com._ensure_float64(values) + elif is_integer_dtype(values): + values = _ensure_float64(values) _method = algos.backfill_2d_inplace_float64 elif values.dtype == np.object_: _method = algos.backfill_2d_inplace_object @@ -537,7 +542,7 @@ def backfill_2d(values, limit=None, mask=None, dtype=None): raise ValueError('Invalid dtype for backfill_2d [%s]' % dtype.name) if mask is None: - mask = com.isnull(values) + mask = isnull(values) mask = mask.view(np.uint8) if np.all(values.shape): @@ -570,22 +575,22 @@ def fill_zeros(result, x, y, name, fill): mask the nan's from x """ - if fill is None or com.is_float_dtype(result): + if fill is None or is_float_dtype(result): return result if name.startswith(('r', '__r')): x, y = y, x - is_typed_variable = (hasattr(y, 'dtype') or hasattr(y, 'type')) - is_scalar = lib.isscalar(y) + is_variable_type = (hasattr(y, 'dtype') or hasattr(y, 'type')) + is_scalar_type = is_scalar(y) - if not is_typed_variable and not is_scalar: + if not is_variable_type and not is_scalar_type: return result - if is_scalar: + if is_scalar_type: y = np.array(y) - if com.is_integer_dtype(y): + if is_integer_dtype(y): if (y == 0).any(): diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index f390e3f04a6c3..a76e348b7dee2 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -11,16 +11,20 @@ import pandas.hashtable as _hash from pandas import compat, lib, algos, tslib -from pandas.core.common import (isnull, notnull, _values_from_object, - _maybe_upcast_putmask, _ensure_float64, - _ensure_int64, _ensure_object, is_float, - is_integer, is_complex, is_float_dtype, - is_complex_dtype, is_integer_dtype, - is_bool_dtype, is_object_dtype, - is_datetime64_dtype, is_timedelta64_dtype, - is_datetime_or_timedelta_dtype, _get_dtype, - is_int_or_datetime_dtype, is_any_int_dtype, - _int64_max) +from pandas.types.common import (_ensure_int64, _ensure_object, + _ensure_float64, _get_dtype, + is_float, is_scalar, + is_integer, is_complex, is_float_dtype, + is_complex_dtype, is_integer_dtype, + is_bool_dtype, is_object_dtype, + is_numeric_dtype, + is_datetime64_dtype, is_timedelta64_dtype, + is_datetime_or_timedelta_dtype, + is_int_or_datetime_dtype, is_any_int_dtype) +from pandas.types.cast import _int64_max, _maybe_upcast_putmask +from pandas.types.missing import isnull, notnull + +from pandas.core.common import _values_from_object class disallow(object): @@ -41,7 +45,8 @@ def _f(*args, **kwargs): 'this dtype'.format( f.__name__.replace('nan', ''))) try: - return f(*args, **kwargs) + with np.errstate(invalid='ignore'): + return f(*args, **kwargs) except ValueError as e: # we want to transform an object array # ValueError message to the more typical TypeError @@ -351,7 +356,7 @@ def _get_counts_nanvar(mask, axis, ddof, dtype=float): d = count - dtype.type(ddof) # always return NaN, never inf - if lib.isscalar(count): + if is_scalar(count): if count <= ddof: 
count = np.nan d = np.nan @@ -509,7 +514,8 @@ def nanskew(values, axis=None, skipna=True): m2 = _zero_out_fperr(m2) m3 = _zero_out_fperr(m3) - result = (count * (count - 1) ** 0.5 / (count - 2)) * (m3 / m2 ** 1.5) + with np.errstate(invalid='ignore', divide='ignore'): + result = (count * (count - 1) ** 0.5 / (count - 2)) * (m3 / m2 ** 1.5) dtype = values.dtype if is_float_dtype(dtype): @@ -558,10 +564,11 @@ def nankurt(values, axis=None, skipna=True): m2 = adjusted2.sum(axis, dtype=np.float64) m4 = adjusted4.sum(axis, dtype=np.float64) - adj = 3 * (count - 1) ** 2 / ((count - 2) * (count - 3)) - numer = count * (count + 1) * (count - 1) * m4 - denom = (count - 2) * (count - 3) * m2**2 - result = numer / denom - adj + with np.errstate(invalid='ignore', divide='ignore'): + adj = 3 * (count - 1) ** 2 / ((count - 2) * (count - 3)) + numer = count * (count + 1) * (count - 1) * m4 + denom = (count - 2) * (count - 3) * m2**2 + result = numer / denom - adj # floating point error numer = _zero_out_fperr(numer) @@ -575,7 +582,8 @@ def nankurt(values, axis=None, skipna=True): if denom == 0: return 0 - result = numer / denom - adj + with np.errstate(invalid='ignore', divide='ignore'): + result = numer / denom - adj dtype = values.dtype if is_float_dtype(dtype): @@ -623,7 +631,7 @@ def _get_counts(mask, axis, dtype=float): return dtype.type(mask.size - mask.sum()) count = mask.shape[axis] - mask.sum(axis) - if lib.isscalar(count): + if is_scalar(count): return dtype.type(count) try: return count.astype(dtype) @@ -635,11 +643,15 @@ def _maybe_null_out(result, axis, mask): if axis is not None and getattr(result, 'ndim', False): null_mask = (mask.shape[axis] - mask.sum(axis)) == 0 if np.any(null_mask): - if np.iscomplexobj(result): - result = result.astype('c16') + if is_numeric_dtype(result): + if np.iscomplexobj(result): + result = result.astype('c16') + else: + result = result.astype('f8') + result[null_mask] = np.nan else: - result = result.astype('f8') - result[null_mask] = np.nan + # GH12941, use None to auto cast null + result[null_mask] = None elif result is not tslib.NaT: null_mask = mask.size - mask.sum() if null_mask == 0: @@ -650,7 +662,8 @@ def _maybe_null_out(result, axis, mask): def _zero_out_fperr(arg): if isinstance(arg, np.ndarray): - return np.where(np.abs(arg) < 1e-14, 0, arg) + with np.errstate(invalid='ignore'): + return np.where(np.abs(arg) < 1e-14, 0, arg) else: return arg.dtype.type(0) if np.abs(arg) < 1e-14 else arg @@ -752,7 +765,8 @@ def f(x, y): ymask = isnull(y) mask = xmask | ymask - result = op(x, y) + with np.errstate(all='ignore'): + result = op(x, y) if mask.any(): if is_bool_dtype(result): diff --git a/pandas/core/ops.py b/pandas/core/ops.py index d1bb67fa0bc13..b81d62c3cda18 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -13,28 +13,32 @@ from pandas import compat, lib, tslib import pandas.index as _index from pandas.util.decorators import Appender -import pandas.core.common as com import pandas.computation.expressions as expressions from pandas.lib import isscalar from pandas.tslib import iNaT from pandas.compat import bind_method import pandas.core.missing as missing import pandas.algos as _algos -import pandas.core.algorithms as algos -from pandas.core.common import (is_list_like, notnull, isnull, - _values_from_object, _maybe_match_name, - needs_i8_conversion, is_datetimelike_v_numeric, - is_integer_dtype, is_categorical_dtype, - is_object_dtype, is_timedelta64_dtype, - is_datetime64_dtype, is_datetime64tz_dtype, - is_bool_dtype, PerformanceWarning, 
ABCSeries) +from pandas.core.common import (_values_from_object, _maybe_match_name, + PerformanceWarning) +from pandas.types.missing import notnull, isnull +from pandas.types.common import (needs_i8_conversion, + is_datetimelike_v_numeric, + is_integer_dtype, is_categorical_dtype, + is_object_dtype, is_timedelta64_dtype, + is_datetime64_dtype, is_datetime64tz_dtype, + is_bool_dtype, is_datetimetz, + is_list_like, + _ensure_object) +from pandas.types.cast import _maybe_upcast_putmask, _find_common_type +from pandas.types.generic import ABCSeries, ABCIndex, ABCPeriodIndex # ----------------------------------------------------------------------------- # Functions that add arithmetic methods to objects, given arithmetic factory # methods -def _create_methods(arith_method, radd_func, comp_method, bool_method, +def _create_methods(arith_method, comp_method, bool_method, use_numexpr, special=False, default_axis='columns'): # creates actual methods based upon arithmetic, comp and bool method # constructors. @@ -55,14 +59,14 @@ def names(x): return "__%s__" % x else: names = lambda x: x - radd_func = radd_func or operator.add + # Inframe, all special methods have default_axis=None, flex methods have # default_axis set to the default (columns) # yapf: disable new_methods = dict( add=arith_method(operator.add, names('add'), op('+'), default_axis=default_axis), - radd=arith_method(radd_func, names('radd'), op('+'), + radd=arith_method(lambda x, y: y + x, names('radd'), op('+'), default_axis=default_axis), sub=arith_method(operator.sub, names('sub'), op('-'), default_axis=default_axis), @@ -149,7 +153,7 @@ def add_methods(cls, new_methods, force, select, exclude): # ---------------------------------------------------------------------- # Arithmetic -def add_special_arithmetic_methods(cls, arith_method=None, radd_func=None, +def add_special_arithmetic_methods(cls, arith_method=None, comp_method=None, bool_method=None, use_numexpr=True, force=False, select=None, exclude=None): @@ -162,8 +166,6 @@ def add_special_arithmetic_methods(cls, arith_method=None, radd_func=None, arith_method : function (optional) factory for special arithmetic methods, with op string: f(op, name, str_rep, default_axis=None, fill_zeros=None, **eval_kwargs) - radd_func : function (optional) - Possible replacement for ``operator.add`` for compatibility comp_method : function, optional, factory for rich comparison - signature: f(op, name, str_rep) use_numexpr : bool, default True @@ -176,12 +178,11 @@ def add_special_arithmetic_methods(cls, arith_method=None, radd_func=None, exclude : iterable of strings (optional) if passed, will not set functions with names in exclude """ - radd_func = radd_func or operator.add # in frame, special methods have default_axis = None, comp methods use # 'columns' - new_methods = _create_methods(arith_method, radd_func, comp_method, + new_methods = _create_methods(arith_method, comp_method, bool_method, use_numexpr, default_axis=None, special=True) @@ -218,7 +219,7 @@ def f(self, other): exclude=exclude) -def add_flex_arithmetic_methods(cls, flex_arith_method, radd_func=None, +def add_flex_arithmetic_methods(cls, flex_arith_method, flex_comp_method=None, flex_bool_method=None, use_numexpr=True, force=False, select=None, exclude=None): @@ -231,9 +232,6 @@ def add_flex_arithmetic_methods(cls, flex_arith_method, radd_func=None, flex_arith_method : function factory for special arithmetic methods, with op string: f(op, name, str_rep, default_axis=None, fill_zeros=None, **eval_kwargs) - radd_func : function 
(optional) - Possible replacement for ``lambda x, y: operator.add(y, x)`` for - compatibility flex_comp_method : function, optional, factory for rich comparison - signature: f(op, name, str_rep) use_numexpr : bool, default True @@ -246,9 +244,8 @@ def add_flex_arithmetic_methods(cls, flex_arith_method, radd_func=None, exclude : iterable of strings (optional) if passed, will not set functions with names in exclude """ - radd_func = radd_func or (lambda x, y: operator.add(y, x)) # in frame, default axis is 'columns', doesn't matter for series and panel - new_methods = _create_methods(flex_arith_method, radd_func, + new_methods = _create_methods(flex_arith_method, flex_comp_method, flex_bool_method, use_numexpr, default_axis='columns', special=False) @@ -264,30 +261,76 @@ def add_flex_arithmetic_methods(cls, flex_arith_method, radd_func=None, exclude=exclude) -class _TimeOp(object): +class _Op(object): + """ - Wrapper around Series datetime/time/timedelta arithmetic operations. - Generally, you should use classmethod ``maybe_convert_for_time_op`` as an - entry point. + Wrapper around Series arithmetic operations. + Generally, you should use classmethod ``_Op.get_op`` as an entry point. + + This validates and coerces lhs and rhs depending on its dtype and + based on op. See _TimeOp also. + + Parameters + ---------- + left : Series + lhs of op + right : object + rhs of op + name : str + name of op + na_op : callable + a function which wraps op """ - fill_value = iNaT + + fill_value = np.nan wrap_results = staticmethod(lambda x: x) dtype = None def __init__(self, left, right, name, na_op): + self.left = left + self.right = right - # need to make sure that we are aligning the data - if isinstance(left, ABCSeries) and isinstance(right, ABCSeries): - left, right = left.align(right, copy=False) + self.name = name + self.na_op = na_op + + self.lvalues = left + self.rvalues = right + + @classmethod + def get_op(cls, left, right, name, na_op): + """ + Get op dispatcher, returns _Op or _TimeOp. + + If ``left`` and ``right`` are appropriate for datetime arithmetic with + operation ``name``, processes them and returns a ``_TimeOp`` object + that stores all the required values. Otherwise, it will generate + either a ``_Op``, indicating that the operation is performed via + normal numpy path. + """ + is_timedelta_lhs = is_timedelta64_dtype(left) + is_datetime_lhs = (is_datetime64_dtype(left) or + is_datetime64tz_dtype(left)) + + if not (is_datetime_lhs or is_timedelta_lhs): + return _Op(left, right, name, na_op) + else: + return _TimeOp(left, right, name, na_op) + + +class _TimeOp(_Op): + """ + Wrapper around Series datetime/time/timedelta arithmetic operations. + Generally, you should use classmethod ``_Op.get_op`` as an entry point. 
+ """ + fill_value = iNaT + + def __init__(self, left, right, name, na_op): + super(_TimeOp, self).__init__(left, right, name, na_op) lvalues = self._convert_to_array(left, name=name) rvalues = self._convert_to_array(right, name=name, other=lvalues) - self.name = name - self.na_op = na_op - # left - self.left = left self.is_offset_lhs = self._is_offset(left) self.is_timedelta_lhs = is_timedelta64_dtype(lvalues) self.is_datetime64_lhs = is_datetime64_dtype(lvalues) @@ -298,7 +341,6 @@ def __init__(self, left, right, name, na_op): self.is_floating_lhs = left.dtype.kind == 'f' # right - self.right = right self.is_offset_rhs = self._is_offset(right) self.is_datetime64_rhs = is_datetime64_dtype(rvalues) self.is_datetime64tz_rhs = is_datetime64tz_dtype(rvalues) @@ -397,7 +439,7 @@ def _convert_to_array(self, values, name=None, other=None): supplied_dtype = values.dtype inferred_type = supplied_dtype or lib.infer_dtype(values) if (inferred_type in ('datetime64', 'datetime', 'date', 'time') or - com.is_datetimetz(inferred_type)): + is_datetimetz(inferred_type)): # if we have a other of timedelta, but use pd.NaT here we # we are in the wrong path if (supplied_dtype is None and other is not None and @@ -411,10 +453,10 @@ def _convert_to_array(self, values, name=None, other=None): values = values.to_series() # datetime with tz elif (isinstance(ovalues, datetime.datetime) and - hasattr(ovalues, 'tz')): + hasattr(ovalues, 'tzinfo')): values = pd.DatetimeIndex(values) # datetime array with tz - elif com.is_datetimetz(values): + elif is_datetimetz(values): if isinstance(values, ABCSeries): values = values._values elif not (isinstance(values, (np.ndarray, ABCSeries)) and @@ -544,30 +586,36 @@ def _is_offset(self, arr_or_obj): """ check if obj or all elements of list-like is DateOffset """ if isinstance(arr_or_obj, pd.DateOffset): return True - elif is_list_like(arr_or_obj): + elif is_list_like(arr_or_obj) and len(arr_or_obj): return all(isinstance(x, pd.DateOffset) for x in arr_or_obj) - else: - return False + return False - @classmethod - def maybe_convert_for_time_op(cls, left, right, name, na_op): - """ - if ``left`` and ``right`` are appropriate for datetime arithmetic with - operation ``name``, processes them and returns a ``_TimeOp`` object - that stores all the required values. Otherwise, it will generate - either a ``NotImplementedError`` or ``None``, indicating that the - operation is unsupported for datetimes (e.g., an unsupported r_op) or - that the data is not the right type for time ops. 
- """ - # decide if we can do it - is_timedelta_lhs = is_timedelta64_dtype(left) - is_datetime_lhs = (is_datetime64_dtype(left) or - is_datetime64tz_dtype(left)) - if not (is_datetime_lhs or is_timedelta_lhs): - return None +def _align_method_SERIES(left, right, align_asobject=False): + """ align lhs and rhs Series """ - return cls(left, right, name, na_op) + # ToDo: Different from _align_method_FRAME, list, tuple and ndarray + # are not coerced here + # because Series has inconsistencies described in #13637 + + if isinstance(right, ABCSeries): + # avoid repeated alignment + if not left.index.equals(right.index): + + if align_asobject: + # to keep original value's dtype for bool ops + left = left.astype(object) + right = right.astype(object) + + left, right = left.align(right, copy=False) + + index, lidx, ridx = left.index.join(right.index, how='outer', + return_indexers=True) + # if DatetimeIndex have different tz, convert to UTC + left.index = index + right.index = index + + return left, right def _arith_method_SERIES(op, name, str_rep, fill_zeros=None, default_axis=None, @@ -583,7 +631,7 @@ def na_op(x, y): raise_on_error=True, **eval_kwargs) except TypeError: if isinstance(y, (np.ndarray, ABCSeries, pd.Index)): - dtype = np.find_common_type([x.dtype, y.dtype], []) + dtype = _find_common_type([x.dtype, y.dtype]) result = np.empty(x.size, dtype=dtype) mask = notnull(x) & notnull(y) result[mask] = op(x[mask], _values_from_object(y[mask])) @@ -596,14 +644,15 @@ def na_op(x, y): "{op}".format(typ=type(x).__name__, op=str_rep)) - result, changed = com._maybe_upcast_putmask(result, ~mask, np.nan) + result, changed = _maybe_upcast_putmask(result, ~mask, np.nan) result = missing.fill_zeros(result, x, y, name, fill_zeros) return result def safe_na_op(lvalues, rvalues): try: - return na_op(lvalues, rvalues) + with np.errstate(all='ignore'): + return na_op(lvalues, rvalues) except Exception: if isinstance(rvalues, ABCSeries): if is_object_dtype(rvalues): @@ -621,56 +670,49 @@ def wrapper(left, right, name=name, na_op=na_op): if isinstance(right, pd.DataFrame): return NotImplemented - time_converted = _TimeOp.maybe_convert_for_time_op(left, right, name, - na_op) + left, right = _align_method_SERIES(left, right) - if time_converted is None: - lvalues, rvalues = left, right - dtype = None - wrap_results = lambda x: x - elif time_converted is NotImplemented: - return NotImplemented - else: - left, right = time_converted.left, time_converted.right - lvalues, rvalues = time_converted.lvalues, time_converted.rvalues - dtype = time_converted.dtype - wrap_results = time_converted.wrap_results - na_op = time_converted.na_op + converted = _Op.get_op(left, right, name, na_op) + + left, right = converted.left, converted.right + lvalues, rvalues = converted.lvalues, converted.rvalues + dtype = converted.dtype + wrap_results = converted.wrap_results + na_op = converted.na_op if isinstance(rvalues, ABCSeries): - rindex = getattr(rvalues, 'index', rvalues) name = _maybe_match_name(left, rvalues) lvalues = getattr(lvalues, 'values', lvalues) rvalues = getattr(rvalues, 'values', rvalues) - if left.index.equals(rindex): - index = left.index - else: - index, lidx, ridx = left.index.join(rindex, how='outer', - return_indexers=True) - - if lidx is not None: - lvalues = algos.take_1d(lvalues, lidx) - - if ridx is not None: - rvalues = algos.take_1d(rvalues, ridx) - - result = wrap_results(safe_na_op(lvalues, rvalues)) - return left._constructor(result, index=index, - name=name, dtype=dtype) + # _Op aligns left and right 
else: - # scalars + name = left.name if (hasattr(lvalues, 'values') and not isinstance(lvalues, pd.DatetimeIndex)): lvalues = lvalues.values - result = wrap_results(safe_na_op(lvalues, rvalues)) - return left._constructor(result, - index=left.index, name=left.name, - dtype=dtype) - + result = wrap_results(safe_na_op(lvalues, rvalues)) + return left._constructor(result, index=left.index, + name=name, dtype=dtype) return wrapper +def _comp_method_OBJECT_ARRAY(op, x, y): + if isinstance(y, list): + y = lib.list_to_object_array(y) + if isinstance(y, (np.ndarray, ABCSeries, ABCIndex)): + if not is_object_dtype(y.dtype): + y = y.astype(np.object_) + + if isinstance(y, (ABCSeries, ABCIndex)): + y = y.values + + result = lib.vec_compare(x, y, op) + else: + result = lib.scalar_compare(x, y, op) + return result + + def _comp_method_SERIES(op, name, str_rep, masker=False): """ Wrapper function for Series arithmetic operations, to avoid @@ -687,16 +729,7 @@ def na_op(x, y): return op(y, x) if is_object_dtype(x.dtype): - if isinstance(y, list): - y = lib.list_to_object_array(y) - - if isinstance(y, (np.ndarray, ABCSeries)): - if not is_object_dtype(y.dtype): - result = lib.vec_compare(x, y.astype(np.object_), op) - else: - result = lib.vec_compare(x, y, op) - else: - result = lib.scalar_compare(x, y, op) + result = _comp_method_OBJECT_ARRAY(op, x, y) else: # we want to compare like types @@ -720,16 +753,16 @@ def na_op(x, y): (not isscalar(y) and needs_i8_conversion(y))): if isscalar(y): + mask = isnull(x) y = _index.convert_scalar(x, _values_from_object(y)) else: + mask = isnull(x) | isnull(y) y = y.view('i8') - - mask = isnull(x) - x = x.view('i8') try: - result = getattr(x, name)(y) + with np.errstate(all='ignore'): + result = getattr(x, name)(y) if result is NotImplemented: raise TypeError("invalid type comparison") except AttributeError: @@ -747,17 +780,31 @@ def wrapper(self, other, axis=None): if isinstance(other, ABCSeries): name = _maybe_match_name(self, other) - if len(self) != len(other): - raise ValueError('Series lengths must match to compare') + if not self._indexed_same(other): + msg = 'Can only compare identically-labeled Series objects' + raise ValueError(msg) return self._constructor(na_op(self.values, other.values), index=self.index, name=name) elif isinstance(other, pd.DataFrame): # pragma: no cover return NotImplemented elif isinstance(other, (np.ndarray, pd.Index)): - if len(self) != len(other): + # do not check length of zerodim array + # as it will broadcast + if (not lib.isscalar(lib.item_from_zerodim(other)) and + len(self) != len(other)): raise ValueError('Lengths must match to compare') + + if isinstance(other, ABCPeriodIndex): + # temp workaround until fixing GH 13637 + # tested in test_nat_comparisons + # (pandas.tests.series.test_operators.TestSeriesOperators) + return self._constructor(na_op(self.values, + other.asobject.values), + index=self.index) + return self._constructor(na_op(self.values, np.asarray(other)), index=self.index).__finalize__(self) + elif isinstance(other, pd.Categorical): if not is_categorical_dtype(self): msg = ("Cannot compare a Categorical for op {op} with Series " @@ -770,13 +817,15 @@ def wrapper(self, other, axis=None): # which would then not take categories ordering into account # we can go directly to op, as the na_op would just test again and # dispatch to it. 
- res = op(self.values, other) + with np.errstate(all='ignore'): + res = op(self.values, other) else: values = self.get_values() if isinstance(other, (list, np.ndarray)): other = np.asarray(other) - res = na_op(values, other) + with np.errstate(all='ignore'): + res = na_op(values, other) if isscalar(res): raise TypeError('Could not compare %s type with Series' % type(other)) @@ -807,8 +856,8 @@ def na_op(x, y): if (is_bool_dtype(x.dtype) and is_bool_dtype(y.dtype)): result = op(x, y) # when would this be hit? else: - x = com._ensure_object(x) - y = com._ensure_object(y) + x = _ensure_object(x) + y = _ensure_object(y) result = lib.vec_binop(x, y, op) else: try: @@ -830,9 +879,10 @@ def wrapper(self, other): fill_int = lambda x: x.fillna(0) fill_bool = lambda x: x.fillna(False).astype(bool) + self, other = _align_method_SERIES(self, other, align_asobject=True) + if isinstance(other, ABCSeries): name = _maybe_match_name(self, other) - other = other.reindex_like(self) is_other_int_dtype = is_integer_dtype(other.dtype) other = fill_int(other) if is_other_int_dtype else fill_bool(other) @@ -855,17 +905,6 @@ def wrapper(self, other): return wrapper -def _radd_compat(left, right): - radd = lambda x, y: y + x - # GH #353, NumPy 1.5.1 workaround - try: - output = radd(left, right) - except TypeError: - raise - - return output - - _op_descriptions = {'add': {'op': '+', 'desc': 'Addition', 'reversed': False, @@ -893,7 +932,32 @@ def _radd_compat(left, right): 'floordiv': {'op': '//', 'desc': 'Integer division', 'reversed': False, - 'reverse': 'rfloordiv'}} + 'reverse': 'rfloordiv'}, + + 'eq': {'op': '==', + 'desc': 'Equal to', + 'reversed': False, + 'reverse': None}, + 'ne': {'op': '!=', + 'desc': 'Not equal to', + 'reversed': False, + 'reverse': None}, + 'lt': {'op': '<', + 'desc': 'Less than', + 'reversed': False, + 'reverse': None}, + 'le': {'op': '<=', + 'desc': 'Less than or equal to', + 'reversed': False, + 'reverse': None}, + 'gt': {'op': '>', + 'desc': 'Greater than', + 'reversed': False, + 'reverse': None}, + 'ge': {'op': '>=', + 'desc': 'Greater than or equal to', + 'reversed': False, + 'reverse': None}} _op_names = list(_op_descriptions.keys()) for k in _op_names: @@ -903,6 +967,32 @@ def _radd_compat(left, right): _op_descriptions[reverse_op]['reverse'] = k +_flex_doc_SERIES = """ +%s of series and other, element-wise (binary operator `%s`). + +Equivalent to ``%s``, but with support to substitute a fill_value for +missing data in one of the inputs. + +Parameters +---------- +other: Series or scalar value +fill_value : None or float value, default None (NaN) + Fill missing (NaN) values with this value. If both Series are + missing, the result will be missing +level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level + +Returns +------- +result : Series + +See also +-------- +Series.%s +""" + + def _flex_method_SERIES(op, name, str_rep, default_axis=None, fill_zeros=None, **eval_kwargs): op_name = name.replace('__', '') @@ -912,38 +1002,17 @@ def _flex_method_SERIES(op, name, str_rep, default_axis=None, fill_zeros=None, else: equiv = 'series ' + op_desc['op'] + ' other' - doc = """ - %s of series and other, element-wise (binary operator `%s`). - - Equivalent to ``%s``, but with support to substitute a fill_value for - missing data in one of the inputs. - - Parameters - ---------- - other: Series or scalar value - fill_value : None or float value, default None (NaN) - Fill missing (NaN) values with this value. 
If both Series are - missing, the result will be missing - level : int or name - Broadcast across a level, matching Index values on the - passed MultiIndex level - - Returns - ------- - result : Series - - See also - -------- - Series.%s - """ % (op_desc['desc'], op_name, equiv, op_desc['reverse']) + doc = _flex_doc_SERIES % (op_desc['desc'], op_name, equiv, + op_desc['reverse']) @Appender(doc) def flex_wrapper(self, other, level=None, fill_value=None, axis=0): # validate axis - self._get_axis_number(axis) + if axis is not None: + self._get_axis_number(axis) if isinstance(other, ABCSeries): return self._binop(other, op, level=level, fill_value=fill_value) - elif isinstance(other, (np.ndarray, ABCSeries, list, tuple)): + elif isinstance(other, (np.ndarray, list, tuple)): if len(other) != len(self): raise ValueError('Lengths must be equal') return self._binop(self._constructor(other, self.index), op, @@ -952,7 +1021,7 @@ def flex_wrapper(self, other, level=None, fill_value=None, axis=0): if fill_value is not None: self = self.fillna(fill_value) - return self._constructor(op(self.values, other), + return self._constructor(op(self, other), self.index).__finalize__(self) flex_wrapper.__name__ = name @@ -960,11 +1029,9 @@ def flex_wrapper(self, other, level=None, fill_value=None, axis=0): series_flex_funcs = dict(flex_arith_method=_flex_method_SERIES, - radd_func=_radd_compat, - flex_comp_method=_comp_method_SERIES) + flex_comp_method=_flex_method_SERIES) series_special_funcs = dict(arith_method=_arith_method_SERIES, - radd_func=_radd_compat, comp_method=_comp_method_SERIES, bool_method=_bool_method_SERIES) @@ -993,6 +1060,75 @@ def flex_wrapper(self, other, level=None, fill_value=None, axis=0): result : DataFrame """ +_flex_doc_FRAME = """ +%s of dataframe and other, element-wise (binary operator `%s`). + +Equivalent to ``%s``, but with support to substitute a fill_value for +missing data in one of the inputs. + +Parameters +---------- +other : Series, DataFrame, or constant +axis : {0, 1, 'index', 'columns'} + For Series input, axis to match Series index on +fill_value : None or float value, default None + Fill missing (NaN) values with this value. 
If both DataFrame + locations are missing, the result will be missing +level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level + +Notes +----- +Mismatched indices will be unioned together + +Returns +------- +result : DataFrame + +See also +-------- +DataFrame.%s +""" + + +def _align_method_FRAME(left, right, axis): + """ convert rhs to meet lhs dims if input is list, tuple or np.ndarray """ + + def to_series(right): + msg = 'Unable to coerce to Series, length must be {0}: given {1}' + if axis is not None and left._get_axis_name(axis) == 'index': + if len(left.index) != len(right): + raise ValueError(msg.format(len(left.index), len(right))) + right = left._constructor_sliced(right, index=left.index) + else: + if len(left.columns) != len(right): + raise ValueError(msg.format(len(left.columns), len(right))) + right = left._constructor_sliced(right, index=left.columns) + return right + + if isinstance(right, (list, tuple)): + right = to_series(right) + + elif isinstance(right, np.ndarray) and right.ndim: # skips np scalar + + if right.ndim == 1: + right = to_series(right) + + elif right.ndim == 2: + if left.shape != right.shape: + msg = ("Unable to coerce to DataFrame, " + "shape must be {0}: given {1}") + raise ValueError(msg.format(left.shape, right.shape)) + + right = left._constructor(right, index=left.index, + columns=left.columns) + else: + msg = 'Unable to coerce to Series/DataFrame, dim must be <= 2: {0}' + raise ValueError(msg.format(right.shape, )) + + return right + def _arith_method_FRAME(op, name, str_rep=None, default_axis='columns', fill_zeros=None, **eval_kwargs): @@ -1010,19 +1146,21 @@ def na_op(x, y): xrav = xrav[mask] yrav = yrav[mask] if np.prod(xrav.shape) and np.prod(yrav.shape): - result[mask] = op(xrav, yrav) + with np.errstate(all='ignore'): + result[mask] = op(xrav, yrav) elif hasattr(x, 'size'): result = np.empty(x.size, dtype=x.dtype) mask = notnull(xrav) xrav = xrav[mask] if np.prod(xrav.shape): - result[mask] = op(xrav, y) + with np.errstate(all='ignore'): + result[mask] = op(xrav, y) else: raise TypeError("cannot perform operation {op} between " "objects of type {x} and {y}".format( op=name, x=type(x), y=type(y))) - result, changed = com._maybe_upcast_putmask(result, ~mask, np.nan) + result, changed = _maybe_upcast_putmask(result, ~mask, np.nan) result = result.reshape(x.shape) result = missing.fill_zeros(result, x, y, name, fill_zeros) @@ -1037,75 +1175,20 @@ def na_op(x, y): else: equiv = 'dataframe ' + op_desc['op'] + ' other' - doc = """ - %s of dataframe and other, element-wise (binary operator `%s`). - - Equivalent to ``%s``, but with support to substitute a fill_value for - missing data in one of the inputs. - - Parameters - ---------- - other : Series, DataFrame, or constant - axis : {0, 1, 'index', 'columns'} - For Series input, axis to match Series index on - fill_value : None or float value, default None - Fill missing (NaN) values with this value. 
If both DataFrame - locations are missing, the result will be missing - level : int or name - Broadcast across a level, matching Index values on the - passed MultiIndex level - - Notes - ----- - Mismatched indices will be unioned together - - Returns - ------- - result : DataFrame - - See also - -------- - DataFrame.%s - """ % (op_desc['desc'], op_name, equiv, op_desc['reverse']) + doc = _flex_doc_FRAME % (op_desc['desc'], op_name, equiv, + op_desc['reverse']) else: doc = _arith_doc_FRAME % name @Appender(doc) def f(self, other, axis=default_axis, level=None, fill_value=None): + + other = _align_method_FRAME(self, other, axis) + if isinstance(other, pd.DataFrame): # Another DataFrame return self._combine_frame(other, na_op, fill_value, level) elif isinstance(other, ABCSeries): return self._combine_series(other, na_op, fill_value, axis, level) - elif isinstance(other, (list, tuple)): - if axis is not None and self._get_axis_name(axis) == 'index': - # TODO: Get all of these to use _constructor_sliced - # casted = self._constructor_sliced(other, index=self.index) - casted = pd.Series(other, index=self.index) - else: - # casted = self._constructor_sliced(other, index=self.columns) - casted = pd.Series(other, index=self.columns) - return self._combine_series(casted, na_op, fill_value, axis, level) - elif isinstance(other, np.ndarray) and other.ndim: # skips np scalar - if other.ndim == 1: - if axis is not None and self._get_axis_name(axis) == 'index': - # casted = self._constructor_sliced(other, - # index=self.index) - casted = pd.Series(other, index=self.index) - else: - # casted = self._constructor_sliced(other, - # index=self.columns) - casted = pd.Series(other, index=self.columns) - return self._combine_series(casted, na_op, fill_value, axis, - level) - elif other.ndim == 2: - # casted = self._constructor(other, index=self.index, - # columns=self.columns) - casted = pd.DataFrame(other, index=self.index, - columns=self.columns) - return self._combine_frame(casted, na_op, fill_value, level) - else: - raise ValueError("Incompatible argument shape: %s" % - (other.shape, )) else: if fill_value is not None: self = self.fillna(fill_value) @@ -1145,39 +1228,14 @@ def na_op(x, y): @Appender('Wrapper for flexible comparison methods %s' % name) def f(self, other, axis=default_axis, level=None): + + other = _align_method_FRAME(self, other, axis) + if isinstance(other, pd.DataFrame): # Another DataFrame return self._flex_compare_frame(other, na_op, str_rep, level) elif isinstance(other, ABCSeries): return self._combine_series(other, na_op, None, axis, level) - - elif isinstance(other, (list, tuple)): - if axis is not None and self._get_axis_name(axis) == 'index': - casted = pd.Series(other, index=self.index) - else: - casted = pd.Series(other, index=self.columns) - - return self._combine_series(casted, na_op, None, axis, level) - - elif isinstance(other, np.ndarray): - if other.ndim == 1: - if axis is not None and self._get_axis_name(axis) == 'index': - casted = pd.Series(other, index=self.index) - else: - casted = pd.Series(other, index=self.columns) - - return self._combine_series(casted, na_op, None, axis, level) - - elif other.ndim == 2: - casted = pd.DataFrame(other, index=self.index, - columns=self.columns) - - return self._flex_compare_frame(casted, na_op, str_rep, level) - - else: - raise ValueError("Incompatible argument shape: %s" % - (other.shape, )) - else: return self._combine_const(other, na_op) @@ -1206,11 +1264,9 @@ def f(self, other): frame_flex_funcs = 
dict(flex_arith_method=_arith_method_FRAME, - radd_func=_radd_compat, flex_comp_method=_flex_comp_method_FRAME) frame_special_funcs = dict(arith_method=_arith_method_FRAME, - radd_func=_radd_compat, comp_method=_comp_method_FRAME, bool_method=_arith_method_FRAME) @@ -1229,7 +1285,7 @@ def na_op(x, y): result = np.empty(len(x), dtype=x.dtype) mask = notnull(x) result[mask] = op(x[mask], y) - result, changed = com._maybe_upcast_putmask(result, ~mask, np.nan) + result, changed = _maybe_upcast_putmask(result, ~mask, np.nan) result = missing.fill_zeros(result, x, y, name, fill_zeros) return result diff --git a/pandas/core/panel.py b/pandas/core/panel.py index 7d0bedcc2b381..f708774dd84ff 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -8,17 +8,20 @@ import numpy as np +from pandas.types.cast import (_infer_dtype_from_scalar, + _possibly_cast_item) +from pandas.types.common import (is_integer, is_list_like, + is_string_like, is_scalar) +from pandas.types.missing import notnull + import pandas.computation.expressions as expressions import pandas.core.common as com import pandas.core.ops as ops import pandas.core.missing as missing from pandas import compat -from pandas import lib from pandas.compat import (map, zip, range, u, OrderedDict, OrderedDefaultdict) from pandas.compat.numpy import function as nv -from pandas.core.categorical import Categorical -from pandas.core.common import (PandasError, _try_sort, _default_index, - _infer_dtype_from_scalar, is_list_like) +from pandas.core.common import PandasError, _try_sort, _default_index from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame, _shared_docs from pandas.core.index import (Index, MultiIndex, _ensure_index, @@ -31,7 +34,7 @@ from pandas.core.ops import _op_descriptions from pandas.core.series import Series from pandas.tools.util import cartesian_product -from pandas.util.decorators import (deprecate, Appender, deprecate_kwarg) +from pandas.util.decorators import (deprecate, Appender) _shared_doc_kwargs = dict( axes='items, major_axis, minor_axis', @@ -99,13 +102,7 @@ def panel_index(time, panels, names=None): if names is None: names = ['time', 'panel'] time, panels = _ensure_like_indices(time, panels) - time_factor = Categorical.from_array(time, ordered=True) - panel_factor = Categorical.from_array(panels, ordered=True) - - labels = [time_factor.codes, panel_factor.codes] - levels = [time_factor.categories, panel_factor.categories] - return MultiIndex(levels, labels, sortorder=None, names=names, - verify_integrity=False) + return MultiIndex.from_arrays([time, panels], sortorder=None, names=names) class Panel(NDFrame): @@ -168,7 +165,7 @@ def _init_data(self, data, copy, dtype, **kwargs): mgr = self._init_matrix(data, passed_axes, dtype=dtype, copy=copy) copy = False dtype = None - elif lib.isscalar(data) and all(x is not None for x in passed_axes): + elif is_scalar(data) and all(x is not None for x in passed_axes): if dtype is None: dtype, data = _infer_dtype_from_scalar(data) values = np.empty([len(x) for x in passed_axes], dtype=dtype) @@ -389,25 +386,15 @@ def _get_plane_axes(self, axis): fromDict = from_dict - def to_sparse(self, fill_value=None, kind='block'): + def to_sparse(self, *args, **kwargs): """ - Convert to SparsePanel + NOT IMPLEMENTED: do not call this method, as sparsifying is not + supported for Panel objects and will raise an error. 
- Parameters - ---------- - fill_value : float, default NaN - kind : {'block', 'integer'} - - Returns - ------- - y : SparseDataFrame + Convert to SparsePanel """ - from pandas.core.sparse import SparsePanel - frames = dict(self.iteritems()) - return SparsePanel(frames, items=self.items, - major_axis=self.major_axis, - minor_axis=self.minor_axis, default_kind=kind, - default_fill_value=fill_value) + raise NotImplementedError("sparsifying is not supported " + "for Panel objects") def to_excel(self, path, na_rep='', engine=None, **kwargs): """ @@ -552,7 +539,7 @@ def set_value(self, *args, **kwargs): made_bigger = not np.array_equal(axes[0], self._info_axis) # how to make this logic simpler? if made_bigger: - com._possibly_cast_item(result, args[0], likely_dtype) + _possibly_cast_item(result, args[0], likely_dtype) return result.set_value(*args) @@ -582,7 +569,7 @@ def __setitem__(self, key, value): 'object was {1}'.format( shape[1:], tuple(map(int, value.shape)))) mat = np.asarray(value) - elif lib.isscalar(value): + elif is_scalar(value): dtype, value = _infer_dtype_from_scalar(value) mat = np.empty(shape[1:], dtype=dtype) mat.fill(value) @@ -653,7 +640,7 @@ def round(self, decimals=0, *args, **kwargs): """ nv.validate_round(args, kwargs) - if com.is_integer(decimals): + if is_integer(decimals): result = np.apply_along_axis(np.round, 0, self.values) return self._wrap_result(result, axis=0) raise TypeError("decimals must be an integer") @@ -687,7 +674,7 @@ def dropna(self, axis=0, how='any', inplace=False): axis = self._get_axis_number(axis) values = self.values - mask = com.notnull(values) + mask = notnull(values) for ax in reversed(sorted(set(range(self._AXIS_LEN)) - set([axis]))): mask = mask.sum(ax) @@ -711,7 +698,7 @@ def _combine(self, other, func, axis=0): return self._combine_panel(other, func) elif isinstance(other, DataFrame): return self._combine_frame(other, func, axis=axis) - elif lib.isscalar(other): + elif is_scalar(other): return self._combine_const(other, func) else: raise NotImplementedError("%s is not supported in combine " @@ -719,7 +706,8 @@ def _combine(self, other, func, axis=0): (str(type(other)), str(type(self)))) def _combine_const(self, other, func): - new_values = func(self.values, other) + with np.errstate(all='ignore'): + new_values = func(self.values, other) d = self._construct_axes_dict() return self._constructor(new_values, **d) @@ -729,14 +717,15 @@ def _combine_frame(self, other, func, axis=0): other = other.reindex(index=index, columns=columns) - if axis == 0: - new_values = func(self.values, other.values) - elif axis == 1: - new_values = func(self.values.swapaxes(0, 1), other.values.T) - new_values = new_values.swapaxes(0, 1) - elif axis == 2: - new_values = func(self.values.swapaxes(0, 2), other.values) - new_values = new_values.swapaxes(0, 2) + with np.errstate(all='ignore'): + if axis == 0: + new_values = func(self.values, other.values) + elif axis == 1: + new_values = func(self.values.swapaxes(0, 1), other.values.T) + new_values = new_values.swapaxes(0, 1) + elif axis == 2: + new_values = func(self.values.swapaxes(0, 2), other.values) + new_values = new_values.swapaxes(0, 2) return self._constructor(new_values, self.items, self.major_axis, self.minor_axis) @@ -750,11 +739,12 @@ def _combine_panel(self, other, func): this = self.reindex(items=items, major=major, minor=minor) other = other.reindex(items=items, major=major, minor=minor) - result_values = func(this.values, other.values) + with np.errstate(all='ignore'): + result_values = 
func(this.values, other.values) return self._constructor(result_values, items, major, minor) - def major_xs(self, key, copy=None): + def major_xs(self, key): """ Return slice of panel along major axis @@ -762,8 +752,6 @@ def major_xs(self, key, copy=None): ---------- key : object Major axis label - copy : boolean [deprecated] - Whether to make a copy of the data Returns ------- @@ -779,13 +767,9 @@ def major_xs(self, key, copy=None): :ref:`MultiIndex Slicers ` """ - if copy is not None: - warnings.warn("copy keyword is deprecated, " - "default is to return a copy or a view if possible") - return self.xs(key, axis=self._AXIS_LEN - 2) - def minor_xs(self, key, copy=None): + def minor_xs(self, key): """ Return slice of panel along minor axis @@ -793,8 +777,6 @@ def minor_xs(self, key, copy=None): ---------- key : object Minor axis label - copy : boolean [deprecated] - Whether to make a copy of the data Returns ------- @@ -810,13 +792,9 @@ def minor_xs(self, key, copy=None): :ref:`MultiIndex Slicers ` """ - if copy is not None: - warnings.warn("copy keyword is deprecated, " - "default is to return a copy or a view if possible") - return self.xs(key, axis=self._AXIS_LEN - 1) - def xs(self, key, axis=1, copy=None): + def xs(self, key, axis=1): """ Return slice of panel along selected axis @@ -825,8 +803,6 @@ def xs(self, key, axis=1, copy=None): key : object Label axis : {'items', 'major', 'minor}, default 1/'major' - copy : boolean [deprecated] - Whether to make a copy of the data Returns ------- @@ -841,10 +817,6 @@ def xs(self, key, axis=1, copy=None): :ref:`MultiIndex Slicers ` """ - if copy is not None: - warnings.warn("copy keyword is deprecated, " - "default is to return a copy or a view if possible") - axis = self._get_axis_number(axis) if axis == 0: return self[key] @@ -924,7 +896,7 @@ def to_frame(self, filter_observations=True): if filter_observations: # shaped like the return DataFrame - mask = com.notnull(self.values).all(axis=0) + mask = notnull(self.values).all(axis=0) # size = mask.sum() selector = mask.ravel() else: @@ -1035,7 +1007,8 @@ def apply(self, func, axis='major', **kwargs): # try ufunc like if isinstance(f, np.ufunc): try: - result = np.apply_along_axis(func, axis, self.values) + with np.errstate(all='ignore'): + result = np.apply_along_axis(func, axis, self.values) return self._wrap_result(result, axis=axis) except (AttributeError): pass @@ -1137,7 +1110,8 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, axis_number = self._get_axis_number(axis_name) f = lambda x: op(x, axis=axis_number, skipna=skipna, **kwds) - result = f(self.values) + with np.errstate(all='ignore'): + result = f(self.values) axes = self._get_plane_axes(axis_name) if result.ndim == 2 and axis_name != self._info_axis_name: @@ -1218,7 +1192,7 @@ def transpose(self, *args, **kwargs): # check if a list of axes was passed in instead as a # single *args element if (len(args) == 1 and hasattr(args[0], '__iter__') and - not com.is_string_like(args[0])): + not is_string_like(args[0])): axes = args[0] else: axes = args @@ -1258,7 +1232,6 @@ def count(self, axis='major'): return self._wrap_result(result, axis) - @deprecate_kwarg(old_arg_name='lags', new_arg_name='periods') def shift(self, periods=1, freq=None, axis='major'): """ Shift index by desired number of periods with an optional time freq. 
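As an illustration of the recurring pattern in the hunks above (nanops.py, ops.py, panel.py) — and not part of the patch itself — the changes wrap element-wise arithmetic in ``np.errstate(...)`` so NumPy's floating-point warnings are suppressed inside pandas internals instead of surfacing to the caller. A minimal sketch with hypothetical input values:

    import numpy as np

    x = np.array([1.0, 0.0, np.nan])
    y = np.array([0.0, 0.0, 2.0])

    # 1.0/0.0 normally emits a 'divide by zero' RuntimeWarning and
    # 0.0/0.0 an 'invalid value' RuntimeWarning; inside errstate both
    # are silenced while the computation proceeds unchanged.
    with np.errstate(all='ignore'):
        result = x / y

    print(result)  # [inf nan nan]
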
diff --git a/pandas/core/panel4d.py b/pandas/core/panel4d.py index 33bd79195cc77..f32de29c5c167 100644 --- a/pandas/core/panel4d.py +++ b/pandas/core/panel4d.py @@ -1,5 +1,6 @@ """ Panel4D: a 4-d dict like collection of panels """ +import warnings from pandas.core.panelnd import create_nd_panel_factory from pandas.core.panel import Panel @@ -18,6 +19,11 @@ having 4 named dimensions. It is intended as a test bed for more N-Dimensional named containers. + DEPRECATED. Panel4D is deprecated and will be removed in a future version. + The recommended way to represent these types of n-dimensional data are with + the `xarray package `__. + Pandas provides a `.to_xarray()` method to automate this conversion. + Parameters ---------- data : ndarray (labels x items x major x minor), or dict of Panels @@ -37,6 +43,15 @@ def panel4d_init(self, data=None, labels=None, items=None, major_axis=None, minor_axis=None, copy=False, dtype=None): + # deprecation GH13564 + warnings.warn("\nPanel4D is deprecated and will be removed in a " + "future version.\nThe recommended way to represent " + "these types of n-dimensional data are with\n" + "the `xarray package " + "`__.\n" + "Pandas provides a `.to_xarray()` method to help " + "automate this conversion.\n", + FutureWarning, stacklevel=2) self._init_data(data=data, labels=labels, items=items, major_axis=major_axis, minor_axis=minor_axis, copy=copy, dtype=dtype) diff --git a/pandas/core/panelnd.py b/pandas/core/panelnd.py index 04fbaab30b42e..26ceeea654e4e 100644 --- a/pandas/core/panelnd.py +++ b/pandas/core/panelnd.py @@ -1,5 +1,6 @@ """ Factory methods to create N-D panels """ +import warnings from pandas.compat import zip import pandas.compat as compat @@ -8,6 +9,11 @@ def create_nd_panel_factory(klass_name, orders, slices, slicer, aliases=None, stat_axis=2, info_axis=0, ns=None): """ manufacture a n-d class: + DEPRECATED. Panelnd is deprecated and will be removed in a future version. + The recommended way to represent these types of n-dimensional data are with + the `xarray package `__. + Pandas provides a `.to_xarray()` method to automate this conversion. 
+ Parameters ---------- klass_name : the klass name @@ -44,6 +50,18 @@ def create_nd_panel_factory(klass_name, orders, slices, slicer, aliases=None, # define the methods #### def __init__(self, *args, **kwargs): + + # deprecation GH13564 + warnings.warn("\n{klass} is deprecated and will be removed in a " + "future version.\nThe recommended way to represent " + "these types of n-dimensional data are with the\n" + "`xarray package " + "`__.\n" + "Pandas provides a `.to_xarray()` method to help " + "automate this conversion.\n".format( + klass=self.__class__.__name__), + FutureWarning, stacklevel=2) + if not (kwargs.get('data') or len(args)): raise Exception("must supply at least a data argument to [%s]" % klass_name) diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 8d237016d1b33..fa5d16bd85e98 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -6,6 +6,11 @@ import numpy as np +from pandas.types.common import _ensure_platform_int, is_list_like +from pandas.types.cast import _maybe_promote +from pandas.types.missing import notnull +import pandas.types.concat as _concat + from pandas.core.series import Series from pandas.core.frame import DataFrame @@ -13,12 +18,9 @@ from pandas.sparse.array import SparseArray from pandas._sparse import IntIndex -from pandas.core.categorical import Categorical -from pandas.core.common import notnull, _ensure_platform_int, _maybe_promote +from pandas.core.categorical import Categorical, _factorize_from_iterable from pandas.core.groupby import get_group_index, _compress_group_index -import pandas.core.common as com -import pandas.types.concat as _concat import pandas.core.algorithms as algos import pandas.algos as _algos @@ -164,9 +166,8 @@ def get_result(self): if self.is_categorical is not None: categories = self.is_categorical.categories ordered = self.is_categorical.ordered - values = [Categorical.from_array(values[:, i], - categories=categories, - ordered=ordered) + values = [Categorical(values[:, i], categories=categories, + ordered=ordered) for i in range(values.shape[-1])] return DataFrame(values, index=index, columns=columns) @@ -469,8 +470,8 @@ def stack(frame, level=-1, dropna=True): def factorize(index): if index.is_unique: return index, np.arange(len(index)) - cat = Categorical(index, ordered=True) - return cat.categories, cat.codes + codes, categories = _factorize_from_iterable(index) + return categories, codes N, K = frame.shape if isinstance(frame.columns, MultiIndex): @@ -983,7 +984,7 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, .. versionadded:: 0.16.1 drop_first : bool, default False - Whether to get k-1 dummies out of n categorical levels by removing the + Whether to get k-1 dummies out of k categorical levels by removing the first level. .. 
versionadded:: 0.18.0 @@ -1063,7 +1064,7 @@ def check_len(item, name): length_msg = ("Length of '{0}' ({1}) did not match the length of " "the columns being encoded ({2}).") - if com.is_list_like(item): + if is_list_like(item): if not len(item) == len(columns_to_encode): raise ValueError(length_msg.format(name, len(item), len(columns_to_encode))) @@ -1105,8 +1106,7 @@ def check_len(item, name): def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False, sparse=False, drop_first=False): # Series avoids inconsistent NaN handling - cat = Categorical.from_array(Series(data), ordered=True) - levels = cat.categories + codes, levels = _factorize_from_iterable(Series(data)) def get_empty_Frame(data, sparse): if isinstance(data, Series): @@ -1122,10 +1122,10 @@ def get_empty_Frame(data, sparse): if not dummy_na and len(levels) == 0: return get_empty_Frame(data, sparse) - codes = cat.codes.copy() + codes = codes.copy() if dummy_na: - codes[codes == -1] = len(cat.categories) - levels = np.append(cat.categories, np.nan) + codes[codes == -1] = len(levels) + levels = np.append(levels, np.nan) # if dummy_na, we just fake a nan level. drop_first will drop it again if drop_first and len(levels) == 1: @@ -1159,14 +1159,17 @@ def get_empty_Frame(data, sparse): sp_indices = sp_indices[1:] dummy_cols = dummy_cols[1:] for col, ixs in zip(dummy_cols, sp_indices): - sarr = SparseArray(np.ones(len(ixs)), - sparse_index=IntIndex(N, ixs), fill_value=0) + sarr = SparseArray(np.ones(len(ixs), dtype=np.uint8), + sparse_index=IntIndex(N, ixs), fill_value=0, + dtype=np.uint8) sparse_series[col] = SparseSeries(data=sarr, index=index) - return SparseDataFrame(sparse_series, index=index, columns=dummy_cols) + out = SparseDataFrame(sparse_series, index=index, columns=dummy_cols, + dtype=np.uint8) + return out else: - dummy_mat = np.eye(number_of_cols).take(codes, axis=0) + dummy_mat = np.eye(number_of_cols, dtype=np.uint8).take(codes, axis=0) if not dummy_na: # reset NaN GH4446 @@ -1207,9 +1210,7 @@ def make_axis_dummies(frame, axis='minor', transform=None): labels = frame.index.labels[num] if transform is not None: mapped_items = items.map(transform) - cat = Categorical.from_array(mapped_items.take(labels), ordered=True) - labels = cat.codes - items = cat.categories + labels, items = _factorize_from_iterable(mapped_items.take(labels)) values = np.eye(len(items), dtype=float) values = values.take(labels, axis=0) diff --git a/pandas/core/series.py b/pandas/core/series.py index 43b4ba3a51212..8379c8bcdcae8 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -13,18 +13,34 @@ import numpy as np import numpy.ma as ma -from pandas.core.common import (isnull, notnull, is_bool_indexer, - _default_index, _maybe_upcast, - _asarray_tuplesafe, _infer_dtype_from_scalar, - is_list_like, _values_from_object, - is_categorical_dtype, - _possibly_cast_to_datetime, - _possibly_castable, _possibly_convert_platform, - _try_sort, is_extension_type, is_datetimetz, - _maybe_match_name, ABCSparseArray, - _coerce_to_dtype, SettingWithCopyError, - _maybe_box_datetimelike, ABCDataFrame, - _dict_compat, is_integer) +from pandas.types.common import (_coerce_to_dtype, is_categorical_dtype, + is_integer, is_integer_dtype, + is_float_dtype, + is_extension_type, is_datetimetz, + is_datetimelike, + is_datetime64tz_dtype, + is_timedelta64_dtype, + is_list_like, + is_hashable, + is_iterator, + is_dict_like, + is_scalar, + _ensure_platform_int) +from pandas.types.generic import ABCSparseArray, ABCDataFrame +from pandas.types.cast import 
(_maybe_upcast, _infer_dtype_from_scalar, + _possibly_convert_platform, + _possibly_cast_to_datetime, _possibly_castable) +from pandas.types.missing import isnull, notnull + +from pandas.core.common import (is_bool_indexer, + _default_index, + _asarray_tuplesafe, + _values_from_object, + _try_sort, + _maybe_match_name, + SettingWithCopyError, + _maybe_box_datetimelike, + _dict_compat) from pandas.core.index import (Index, MultiIndex, InvalidIndexError, Float64Index, _ensure_index) from pandas.core.indexing import check_bool_indexer, maybe_convert_indices @@ -36,7 +52,7 @@ CombinedDatetimelikeProperties) from pandas.tseries.index import DatetimeIndex from pandas.tseries.tdi import TimedeltaIndex -from pandas.tseries.period import PeriodIndex, Period +from pandas.tseries.period import PeriodIndex from pandas import compat from pandas.util.terminal import get_terminal_size from pandas.compat import zip, u, OrderedDict, StringIO @@ -46,7 +62,6 @@ import pandas.core.algorithms as algos import pandas.core.common as com -import pandas.core.datetools as datetools import pandas.core.nanops as nanops import pandas.formats.format as fmt from pandas.util.decorators import Appender, deprecate_kwarg, Substitution @@ -63,7 +78,8 @@ axes='index', klass='Series', axes_single_arg="{0, 'index'}", inplace="""inplace : boolean, default False If True, performs operation inplace and returns None.""", - duplicated='Series') + unique='np.ndarray', duplicated='Series', + optional_by='') def _coerce_method(converter): @@ -273,14 +289,18 @@ def _set_axis(self, axis, labels, fastpath=False): is_all_dates = labels.is_all_dates if is_all_dates: - if not isinstance(labels, (DatetimeIndex, PeriodIndex, TimedeltaIndex)): - labels = DatetimeIndex(labels) + try: + labels = DatetimeIndex(labels) + # need to set here becuase we changed the index + if fastpath: + self._data.set_axis(axis, labels) + except (tslib.OutOfBoundsDatetime, ValueError): + # labels may exceeds datetime bounds, + # or not be a DatetimeIndex + pass - # need to set here becuase we changed the index - if fastpath: - self._data.set_axis(axis, labels) self._set_subtyp(is_all_dates) object.__setattr__(self, '_index', labels) @@ -303,7 +323,7 @@ def name(self): @name.setter def name(self, value): - if value is not None and not com.is_hashable(value): + if value is not None and not is_hashable(value): raise TypeError('Series.name must be a hashable type') object.__setattr__(self, '_name', value) @@ -580,7 +600,7 @@ def __getitem__(self, key): try: result = self.index.get_value(self, key) - if not lib.isscalar(result): + if not is_scalar(result): if is_list_like(result) and not isinstance(result, Series): # we need to box if we have a non-unique index here @@ -613,10 +633,10 @@ def __getitem__(self, key): except Exception: raise - if com.is_iterator(key): + if is_iterator(key): key = list(key) - if is_bool_indexer(key): + if com.is_bool_indexer(key): key = check_bool_indexer(self.index, key) return self._get_with(key) @@ -710,9 +730,9 @@ def setitem(key, value): elif key is Ellipsis: self[:] = value return - elif is_bool_indexer(key): + elif com.is_bool_indexer(key): pass - elif com.is_timedelta64_dtype(self.dtype): + elif is_timedelta64_dtype(self.dtype): # reassign a null value to iNaT if isnull(value): value = tslib.iNaT @@ -736,10 +756,10 @@ def setitem(key, value): if 'unorderable' in str(e): # pragma: no cover raise IndexError(key) - if is_bool_indexer(key): + if com.is_bool_indexer(key): key = check_bool_indexer(self.index, key) try: - self.where(~key, 
value, inplace=True) + self._where(~key, value, inplace=True) return except InvalidIndexError: pass @@ -828,14 +848,22 @@ def repeat(self, reps, *args, **kwargs): def reshape(self, *args, **kwargs): """ - Return the values attribute of `self` with shape `args`. - However, if the specified shape matches exactly the current - shape, `self` is returned for compatibility reasons. + DEPRECATED: calling this method will raise an error in a + future release. Please call ``.values.reshape(...)`` instead. + + return an ndarray with the values shape + if the specified shape matches exactly the current shape, then + return self (for compat) See also -------- numpy.ndarray.reshape """ + warnings.warn("reshape is deprecated and will raise " + "in a subsequent release. Please use " + ".values.reshape(...) instead", FutureWarning, + stacklevel=2) + if len(args) == 1 and hasattr(args[0], '__iter__'): shape = args[0] else: @@ -1060,7 +1088,7 @@ def _get_repr(self, name=False, header=True, index=True, length=True, def __iter__(self): """ provide iteration over the values of the Series box values if necessary """ - if com.is_datetimelike(self): + if is_datetimelike(self): return (_maybe_box_datetimelike(x) for x in self._values) else: return iter(self._values) @@ -1206,6 +1234,15 @@ def mode(self): # TODO: Add option for bins like value_counts() return algos.mode(self) + @Appender(base._shared_docs['unique'] % _shared_doc_kwargs) + def unique(self): + result = super(Series, self).unique() + if is_datetime64tz_dtype(self.dtype): + # to return array of Timestamp with tz + # ToDo: it must return DatetimeArray with tz in pandas 2.0 + return result.asobject.values + return result + @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) @Appender(base._shared_docs['drop_duplicates'] % _shared_doc_kwargs) @@ -1349,7 +1386,7 @@ def quantile(self, q=0.5, interpolation='linear'): result = self._data.quantile(qs=q, interpolation=interpolation) - if com.is_list_like(q): + if is_list_like(q): return self._constructor(result, index=Float64Index(q), name=self.name) @@ -1481,20 +1518,25 @@ def dot(self, other): @Appender(base._shared_docs['searchsorted']) def searchsorted(self, v, side='left', sorter=None): if sorter is not None: - sorter = com._ensure_platform_int(sorter) + sorter = _ensure_platform_int(sorter) return self._values.searchsorted(Series(v)._values, side=side, sorter=sorter) # ------------------------------------------------------------------- # Combination - def append(self, to_append, verify_integrity=False): + def append(self, to_append, ignore_index=False, verify_integrity=False): """ Concatenate two or more Series. Parameters ---------- to_append : Series or list/tuple of Series + ignore_index : boolean, default False + If True, do not use the index labels. + + .. 
versionadded: 0.19.0 + verify_integrity : boolean, default False If True, raise Exception on creating index with duplicates @@ -1525,6 +1567,17 @@ def append(self, to_append, verify_integrity=False): 5 6 dtype: int64 + With `ignore_index` set to True: + + >>> s1.append(s2, ignore_index=True) + 0 1 + 1 2 + 2 3 + 3 4 + 4 5 + 5 6 + dtype: int64 + With `verify_integrity` set to True: >>> s1.append(s2, verify_integrity=True) @@ -1538,7 +1591,7 @@ def append(self, to_append, verify_integrity=False): to_concat = [self] + to_append else: to_concat = [self, to_append] - return concat(to_concat, ignore_index=False, + return concat(to_concat, ignore_index=ignore_index, verify_integrity=verify_integrity) def _binop(self, other, func, level=None, fill_value=None): @@ -1585,7 +1638,8 @@ def _binop(self, other, func, level=None, fill_value=None): this_vals[this_mask & mask] = fill_value other_vals[other_mask & mask] = fill_value - result = func(this_vals, other_vals) + with np.errstate(all='ignore'): + result = func(this_vals, other_vals) name = _maybe_match_name(self, other) result = self._constructor(result, index=new_index, name=name) result = result.__finalize__(self) @@ -1617,10 +1671,12 @@ def combine(self, other, func, fill_value=nan): for i, idx in enumerate(new_index): lv = self.get(idx, fill_value) rv = other.get(idx, fill_value) - new_values[i] = func(lv, rv) + with np.errstate(all='ignore'): + new_values[i] = func(lv, rv) else: new_index = self.index - new_values = func(self._values, other) + with np.errstate(all='ignore'): + new_values = func(self._values, other) new_name = self.name return self._constructor(new_values, index=new_index, name=new_name) @@ -1727,7 +1783,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, elif isinstance(index, MultiIndex): from pandas.core.groupby import _lexsort_indexer indexer = _lexsort_indexer(index.labels, orders=ascending) - indexer = com._ensure_platform_int(indexer) + indexer = _ensure_platform_int(indexer) new_index = index.take(indexer) else: new_index, indexer = index.sort_values(return_indexer=True, @@ -2199,14 +2255,15 @@ def apply(self, func, convert_dtype=True, args=(), **kwds): else: f = func - if isinstance(f, np.ufunc): - return f(self) + with np.errstate(all='ignore'): + if isinstance(f, np.ufunc): + return f(self) - if is_extension_type(self.dtype): - mapped = self._values.map(f) - else: - values = self.asobject - mapped = lib.map_infer(values, f, convert=convert_dtype) + if is_extension_type(self.dtype): + mapped = self._values.map(f) + else: + values = self.asobject + mapped = lib.map_infer(values, f, convert=convert_dtype) if len(mapped) and isinstance(mapped[0], Series): from pandas.core.frame import DataFrame @@ -2231,7 +2288,8 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, if numeric_only: raise NotImplementedError('Series.{0} does not implement ' 'numeric_only.'.format(name)) - return op(delegate, skipna=skipna, **kwds) + with np.errstate(all='ignore'): + return op(delegate, skipna=skipna, **kwds) return delegate._reduce(op=op, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only, @@ -2265,8 +2323,8 @@ def align(self, other, join='outer', axis=None, level=None, copy=True, @Appender(generic._shared_docs['rename'] % _shared_doc_kwargs) def rename(self, index=None, **kwargs): - non_mapping = lib.isscalar(index) or (com.is_list_like(index) and - not com.is_dict_like(index)) + non_mapping = is_scalar(index) or (is_list_like(index) and + not is_dict_like(index)) if non_mapping: 
return self._set_name(index, inplace=kwargs.get('inplace')) return super(Series, self).rename(index=index, **kwargs) @@ -2345,7 +2403,7 @@ def take(self, indices, axis=0, convert=True, is_copy=False, **kwargs): if convert: indices = maybe_convert_indices(indices, len(self._get_axis(axis))) - indices = com._ensure_platform_int(indices) + indices = _ensure_platform_int(indices) new_index = self.index.take(indices) new_values = self._values.take(indices) return self._constructor(new_values, @@ -2492,16 +2550,17 @@ def from_csv(cls, path, sep=',', parse_dates=True, header=None, return result - def to_csv(self, path, index=True, sep=",", na_rep='', float_format=None, - header=False, index_label=None, mode='w', nanRep=None, - encoding=None, date_format=None, decimal='.'): + def to_csv(self, path=None, index=True, sep=",", na_rep='', + float_format=None, header=False, index_label=None, + mode='w', encoding=None, date_format=None, decimal='.'): """ Write Series to a comma-separated values (csv) file Parameters ---------- - path : string file path or file handle / StringIO. If None is provided - the result is returned as a string. + path : string or file handle, default None + File path or object, if None is provided the result is returned as + a string. na_rep : string, default '' Missing data representation float_format : string, default None @@ -2531,7 +2590,7 @@ def to_csv(self, path, index=True, sep=",", na_rep='', float_format=None, # result is only a string if no path provided, otherwise None result = df.to_csv(path, index=index, sep=sep, na_rep=na_rep, float_format=float_format, header=header, - index_label=index_label, mode=mode, nanRep=nanRep, + index_label=index_label, mode=mode, encoding=encoding, date_format=date_format, decimal=decimal) if path is None: @@ -2601,52 +2660,6 @@ def last_valid_index(self): # ---------------------------------------------------------------------- # Time series-oriented methods - def asof(self, where): - """ - Return last good (non-NaN) value in Series if value is NaN for - requested date. - - If there is no good value, NaN is returned. 
- - Parameters - ---------- - where : date or array of dates - - Notes - ----- - Dates are assumed to be sorted - - Returns - ------- - value or NaN - """ - if isinstance(where, compat.string_types): - where = datetools.to_datetime(where) - - values = self._values - - if not hasattr(where, '__iter__'): - start = self.index[0] - if isinstance(self.index, PeriodIndex): - where = Period(where, freq=self.index.freq).ordinal - start = start.ordinal - - if where < start: - return np.nan - loc = self.index.searchsorted(where, side='right') - if loc > 0: - loc -= 1 - while isnull(values[loc]) and loc > 0: - loc -= 1 - return values[loc] - - if not isinstance(where, Index): - where = Index(where) - - locs = self.index.asof_locs(where, notnull(values)) - new_values = algos.take_1d(values, locs) - return self._constructor(new_values, index=where).__finalize__(self) - def to_timestamp(self, freq=None, how='start', copy=True): """ Cast to datetimeindex of timestamps, at *beginning* of period @@ -2817,7 +2830,7 @@ def _try_cast(arr, take_fast_path): subarr = np.array(data, copy=False) # possibility of nan -> garbage - if com.is_float_dtype(data.dtype) and com.is_integer_dtype(dtype): + if is_float_dtype(data.dtype) and is_integer_dtype(dtype): if not isnull(data).any(): subarr = _try_cast(data, True) elif copy: @@ -2843,7 +2856,7 @@ def _try_cast(arr, take_fast_path): subarr = data.copy() return subarr - elif isinstance(data, list) and len(data) > 0: + elif isinstance(data, (list, tuple)) and len(data) > 0: if dtype is not None: try: subarr = _try_cast(data, False) diff --git a/pandas/core/sparse.py b/pandas/core/sparse.py index 701e6b1102b05..4fc329844d616 100644 --- a/pandas/core/sparse.py +++ b/pandas/core/sparse.py @@ -8,4 +8,3 @@ from pandas.sparse.series import SparseSeries from pandas.sparse.frame import SparseDataFrame -from pandas.sparse.panel import SparsePanel diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 5b1b8bd05af42..3041b17b99b17 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1,15 +1,21 @@ import numpy as np from pandas.compat import zip -from pandas.core.common import (isnull, notnull, _values_from_object, - is_bool_dtype, - is_list_like, is_categorical_dtype, - is_object_dtype) +from pandas.types.generic import ABCSeries, ABCIndex +from pandas.types.missing import isnull, notnull +from pandas.types.common import (is_bool_dtype, + is_categorical_dtype, + is_object_dtype, + is_string_like, + is_list_like, + is_scalar, + is_integer) +from pandas.core.common import _values_from_object + from pandas.core.algorithms import take_1d import pandas.compat as compat from pandas.core.base import AccessorProperty, NoNewAttributesMixin -from pandas.types import api as gt -from pandas.util.decorators import Appender, deprecate_kwarg +from pandas.util.decorators import Appender import re import pandas.lib as lib import warnings @@ -152,14 +158,15 @@ def _map(f, arr, na_mask=False, na_value=np.nan, dtype=object): if not len(arr): return np.ndarray(0, dtype=dtype) - if isinstance(arr, gt.ABCSeries): + if isinstance(arr, ABCSeries): arr = arr.values if not isinstance(arr, np.ndarray): arr = np.asarray(arr, dtype=object) if na_mask: mask = isnull(arr) try: - result = lib.map_infer_mask(arr, f, mask.view(np.uint8)) + convert = not all(mask) + result = lib.map_infer_mask(arr, f, mask.view(np.uint8), convert) except (TypeError, AttributeError): def g(x): @@ -309,6 +316,10 @@ def str_replace(arr, pat, repl, n=-1, case=True, flags=0): ------- replaced : Series/Index of 
objects """ + + # Check whether repl is valid (GH 13438) + if not is_string_like(repl): + raise TypeError("repl must be a string") use_re = not case or len(pat) > 1 or flags if use_re: @@ -339,7 +350,7 @@ def str_repeat(arr, repeats): ------- repeated : Series/Index of objects """ - if lib.isscalar(repeats): + if is_scalar(repeats): def rep(x): try: @@ -543,7 +554,7 @@ def str_extract(arr, pat, flags=0, expand=None): each group. Any capture group names in regular expression pat will be used for column names; otherwise capture group numbers will be used. The dtype of each result column is always object, even when - no match is found. If expand=True and pat has only one capture group, + no match is found. If expand=False and pat has only one capture group, then return a Series (if subject is a Series) or Index (if subject is an Index). @@ -692,7 +703,7 @@ def str_extractall(arr, pat, flags=0): if regex.groups == 0: raise ValueError("pattern contains no capture groups") - if isinstance(arr, gt.ABCIndex): + if isinstance(arr, ABCIndex): arr = arr.to_series().reset_index(drop=True) names = dict(zip(regex.groupindex.values(), regex.groupindex.keys())) @@ -708,6 +719,8 @@ def str_extractall(arr, pat, flags=0): subject_key = (subject_key, ) for match_i, match_tuple in enumerate(regex.findall(subject)): + if isinstance(match_tuple, compat.string_types): + match_tuple = (match_tuple,) na_tuple = [np.NaN if group == "" else group for group in match_tuple] match_list.append(na_tuple) @@ -903,6 +916,10 @@ def str_pad(arr, width, side='left', fillchar=' '): if len(fillchar) != 1: raise TypeError('fillchar must be a character, not str') + if not is_integer(width): + msg = 'width must be of integer type, not {0}' + raise TypeError(msg.format(type(width).__name__)) + if side == 'left': f = lambda x: x.rjust(width, fillchar) elif side == 'right': @@ -1385,8 +1402,6 @@ def cat(self, others=None, sep=None, na_rep=None): result = str_cat(data, others=others, sep=sep, na_rep=na_rep) return self._wrap_result(result, use_codes=(not self._is_categorical)) - @deprecate_kwarg('return_type', 'expand', mapping={'series': False, - 'frame': True}) @copy(str_split) def split(self, pat=None, n=-1, expand=False): result = str_split(self._data, pat, n=n) @@ -1532,7 +1547,7 @@ def rjust(self, width, fillchar=' '): return self.pad(width, side='left', fillchar=fillchar) def zfill(self, width): - """" + """ Filling left side of strings in the Series/Index with 0. Equivalent to :meth:`str.zfill`. 
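The checks added above (the ``repl`` validation for GH 13438 and the integer check on ``width``) turn silent misuse of the string methods into explicit ``TypeError``s. A minimal sketch of the resulting behaviour, on made-up data:

    import pandas as pd

    s = pd.Series(['a1', 'b2', 'c3'])

    s.str.replace(r'\d', 'X')     # fine: repl is a string -> ['aX', 'bX', 'cX']
    s.str.pad(5, fillchar='-')    # fine: width is an integer

    # s.str.replace(r'\d', 0)     # raises TypeError: repl must be a string
    # s.str.pad(5.5)              # raises TypeError: width must be of integer type, not float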
@@ -1814,7 +1829,7 @@ class StringAccessorMixin(object): def _make_str_accessor(self): from pandas.core.index import Index - if (isinstance(self, gt.ABCSeries) and + if (isinstance(self, ABCSeries) and not ((is_categorical_dtype(self.dtype) and is_object_dtype(self.values.categories)) or (is_object_dtype(self.dtype)))): diff --git a/pandas/core/window.py b/pandas/core/window.py index cd66d4e30c351..b7276aed506de 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -11,17 +11,33 @@ import numpy as np from collections import defaultdict +from pandas.types.generic import (ABCSeries, + ABCDataFrame, + ABCDatetimeIndex, + ABCTimedeltaIndex, + ABCPeriodIndex) +from pandas.types.common import (is_integer, + is_bool, + is_float_dtype, + is_integer_dtype, + needs_i8_conversion, + is_timedelta64_dtype, + is_list_like, + _ensure_float64) import pandas as pd from pandas.lib import isscalar from pandas.core.base import (PandasObject, SelectionMixin, GroupByMixin) import pandas.core.common as com -import pandas.algos as algos +import pandas._window as _window +from pandas.tseries.offsets import DateOffset from pandas import compat from pandas.compat.numpy import function as nv -from pandas.util.decorators import Substitution, Appender +from pandas.util.decorators import (Substitution, Appender, + cache_readonly) from textwrap import dedent + _shared_docs = dict() _doc_template = """ @@ -38,19 +54,21 @@ class _Window(PandasObject, SelectionMixin): _attributes = ['window', 'min_periods', 'freq', 'center', 'win_type', - 'axis'] + 'axis', 'on'] exclusions = set() def __init__(self, obj, window=None, min_periods=None, freq=None, - center=False, win_type=None, axis=0, **kwargs): + center=False, win_type=None, axis=0, on=None, **kwargs): if freq is not None: warnings.warn("The freq kw is deprecated and will be removed in a " "future version. You can resample prior to passing " "to a window function", FutureWarning, stacklevel=3) + self.__dict__.update(kwargs) self.blocks = [] self.obj = obj + self.on = on self.window = window self.min_periods = min_periods self.freq = freq @@ -63,19 +81,32 @@ def __init__(self, obj, window=None, min_periods=None, freq=None, def _constructor(self): return Window + @property + def is_datetimelike(self): + return None + + @property + def _on(self): + return None + + @property + def is_freq_type(self): + return self.win_type == 'freq' + def validate(self): - if self.center is not None and not com.is_bool(self.center): + if self.center is not None and not is_bool(self.center): raise ValueError("center must be a boolean") if self.min_periods is not None and not \ - com.is_integer(self.min_periods): + is_integer(self.min_periods): raise ValueError("min_periods must be an integer") def _convert_freq(self, how=None): """ resample according to the how, return a new object """ obj = self._selected_obj + index = None if (self.freq is not None and - isinstance(obj, (com.ABCSeries, com.ABCDataFrame))): + isinstance(obj, (ABCSeries, ABCDataFrame))): if how is not None: warnings.warn("The how kw argument is deprecated and removed " "in a future version. 
You can resample prior " @@ -83,13 +114,24 @@ def _convert_freq(self, how=None): stacklevel=6) obj = obj.resample(self.freq).aggregate(how or 'asfreq') - return obj + + return obj, index def _create_blocks(self, how): """ split data into blocks & return conformed data """ - obj = self._convert_freq(how) - return obj.as_blocks(copy=False).values(), obj + obj, index = self._convert_freq(how) + if index is not None: + index = self._on + + # filter out the on from the object + if self.on is not None: + if obj.ndim == 2: + obj = obj.reindex(columns=obj.columns.difference([self.on]), + copy=False) + blocks = obj.as_blocks(copy=False).values() + + return blocks, obj, index def _gotitem(self, key, ndim, subset=None): """ @@ -111,7 +153,7 @@ def _gotitem(self, key, ndim, subset=None): self = self._shallow_copy(subset) self._reset_cache() if subset.ndim == 2: - if isscalar(key) and key in subset or com.is_list_like(key): + if isscalar(key) and key in subset or is_list_like(key): self._selection = key return self @@ -143,6 +185,21 @@ def __unicode__(self): return "{klass} [{attrs}]".format(klass=self._window_type, attrs=','.join(attrs)) + def _get_index(self, index=None): + """ + Return index as ndarrays + + Returns + ------- + tuple of (index, index_as_ndarray) + """ + + if self.is_freq_type: + if index is None: + index = self._on + return index, index.asi8 + return index, index + def _prep_values(self, values=None, kill_inf=True, how=None): if values is None: @@ -150,11 +207,11 @@ def _prep_values(self, values=None, kill_inf=True, how=None): # GH #12373 : rolling functions error on float32 data # make sure the data is coerced to float64 - if com.is_float_dtype(values.dtype): - values = com._ensure_float64(values) - elif com.is_integer_dtype(values.dtype): - values = com._ensure_float64(values) - elif com.needs_i8_conversion(values.dtype): + if is_float_dtype(values.dtype): + values = _ensure_float64(values) + elif is_integer_dtype(values.dtype): + values = _ensure_float64(values) + elif needs_i8_conversion(values.dtype): raise NotImplementedError("ops for {action} for this " "dtype {dtype} are not " "implemented".format( @@ -162,7 +219,7 @@ def _prep_values(self, values=None, kill_inf=True, how=None): dtype=values.dtype)) else: try: - values = com._ensure_float64(values) + values = _ensure_float64(values) except (ValueError, TypeError): raise TypeError("cannot handle this type -> {0}" "".format(values.dtype)) @@ -178,13 +235,13 @@ def _wrap_result(self, result, block=None, obj=None): if obj is None: obj = self._selected_obj - index = obj.index + if isinstance(result, np.ndarray): # coerce if necessary if block is not None: - if com.is_timedelta64_dtype(block.values.dtype): + if is_timedelta64_dtype(block.values.dtype): result = pd.to_timedelta( result.ravel(), unit='ns').values.reshape(result.shape) @@ -206,6 +263,9 @@ def _wrap_results(self, results, blocks, obj): obj : conformed data (may be resampled) """ + from pandas import Series + from pandas.core.index import _ensure_index + final = [] for result, block in zip(results, blocks): @@ -214,9 +274,31 @@ def _wrap_results(self, results, blocks, obj): return result final.append(result) + # if we have an 'on' column + # we want to put it back into the results + # in the same location + columns = self._selected_obj.columns + if self.on is not None \ + and not self._on.equals(obj.index): + + name = self._on.name + final.append(Series(self._on, index=obj.index, name=name)) + + if self._selection is not None: + + selection = 
_ensure_index(self._selection) + + # need to reorder to include original location of + # the on column (if its not already there) + if name not in selection: + columns = self.obj.columns + indexer = columns.get_indexer(selection.tolist() + [name]) + columns = columns.take(sorted(indexer)) + if not len(final): return obj.astype('float64') - return pd.concat(final, axis=1).reindex(columns=obj.columns) + return pd.concat(final, axis=1).reindex(columns=columns, + copy=False) def _center_window(self, result, window): """ center the result in the window """ @@ -262,30 +344,116 @@ def aggregate(self, arg, *args, **kwargs): class Window(_Window): """ - Provides rolling transformations. + Provides rolling window calculcations. .. versionadded:: 0.18.0 Parameters ---------- - window : int - Size of the moving window. This is the number of observations used for - calculating the statistic. + window : int, or offset + Size of the moving window. This is the number of observations used for + calculating the statistic. Each window will be a fixed size. + + If its an offset then this will be the time period of each window. Each + window will be a variable sized based on the observations included in + the time-period. This is only valid for datetimelike indexes. This is + new in 0.19.0 min_periods : int, default None Minimum number of observations in window required to have a value - (otherwise result is NA). + (otherwise result is NA). For a window that is specified by an offset, + this will default to 1. freq : string or DateOffset object, optional (default None) (DEPRECATED) Frequency to conform the data to before computing the statistic. Specified as a frequency string or DateOffset object. center : boolean, default False Set the labels at the center of the window. win_type : string, default None - prove a window type, see the notes below - axis : int, default 0 + Provide a window type. See the notes below. + on : string, optional + For a DataFrame, column on which to calculate + the rolling window, rather than the index + + .. versionadded:: 0.19.0 + + axis : int or string, default 0 Returns ------- - a Window sub-classed for the particular operation + a Window or Rolling sub-classed for the particular operation + + Examples + -------- + + >>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}) + >>> df + B + 0 0.0 + 1 1.0 + 2 2.0 + 3 NaN + 4 4.0 + + Rolling sum with a window length of 2, using the 'triang' + window type. + + >>> df.rolling(2, win_type='triang').sum() + B + 0 NaN + 1 1.0 + 2 2.5 + 3 NaN + 4 NaN + + Rolling sum with a window length of 2, min_periods defaults + to the window length. + + >>> df.rolling(2).sum() + B + 0 NaN + 1 1.0 + 2 3.0 + 3 NaN + 4 NaN + + Same as above, but explicity set the min_periods + + >>> df.rolling(2, min_periods=1).sum() + B + 0 0.0 + 1 1.0 + 2 3.0 + 3 2.0 + 4 4.0 + + A ragged (meaning not-a-regular frequency), time-indexed DataFrame + + >>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}, + ....: index = [pd.Timestamp('20130101 09:00:00'), + ....: pd.Timestamp('20130101 09:00:02'), + ....: pd.Timestamp('20130101 09:00:03'), + ....: pd.Timestamp('20130101 09:00:05'), + ....: pd.Timestamp('20130101 09:00:06')]) + + >>> df + B + 2013-01-01 09:00:00 0.0 + 2013-01-01 09:00:02 1.0 + 2013-01-01 09:00:03 2.0 + 2013-01-01 09:00:05 NaN + 2013-01-01 09:00:06 4.0 + + + Contrasting to an integer rolling window, this will roll a variable + length window corresponding to the time period. + The default for min_periods is 1. 
+ + >>> df.rolling('2s').sum() + B + 2013-01-01 09:00:00 0.0 + 2013-01-01 09:00:02 1.0 + 2013-01-01 09:00:03 3.0 + 2013-01-01 09:00:05 NaN + 2013-01-01 09:00:06 4.0 Notes ----- @@ -296,7 +464,10 @@ class Window(_Window): frequency by resampling the data. This is done with the default parameters of :meth:`~pandas.Series.resample` (i.e. using the `mean`). - The recognized window types are: + To learn more about the offsets & frequency strings, please see `this link + `__. + + The recognized win_types are: * ``boxcar`` * ``triang`` @@ -312,7 +483,8 @@ class Window(_Window): * ``gaussian`` (needs std) * ``general_gaussian`` (needs power, width) * ``slepian`` (needs width). -""" + + """ def validate(self): super(Window, self).validate() @@ -320,7 +492,9 @@ def validate(self): window = self.window if isinstance(window, (list, tuple, np.ndarray)): pass - elif com.is_integer(window): + elif is_integer(window): + if window < 0: + raise ValueError("window must be non-negative") try: import scipy.signal as sig except ImportError: @@ -343,7 +517,7 @@ def _prep_window(self, **kwargs): window = self._get_window() if isinstance(window, (list, tuple, np.ndarray)): return com._asarray_tuplesafe(window).astype(float) - elif com.is_integer(window): + elif is_integer(window): import scipy.signal as sig # the below may pop from kwargs @@ -389,7 +563,7 @@ def _apply_window(self, mean=True, how=None, **kwargs): window = self._prep_window(**kwargs) center = self.center - blocks, obj = self._create_blocks(how=how) + blocks, obj, index = self._create_blocks(how=how) results = [] for b in blocks: try: @@ -407,9 +581,10 @@ def _apply_window(self, mean=True, how=None, **kwargs): def f(arg, *args, **kwargs): minp = _use_window(self.min_periods, len(window)) - return algos.roll_window(np.concatenate((arg, additional_nans)) - if center else arg, window, minp, - avg=mean) + return _window.roll_window(np.concatenate((arg, + additional_nans)) + if center else arg, window, minp, + avg=mean) result = np.apply_along_axis(f, self.axis, values) @@ -517,7 +692,8 @@ def _apply(self, func, name=None, window=None, center=None, if check_minp is None: check_minp = _use_window - blocks, obj = self._create_blocks(how=how) + blocks, obj, index = self._create_blocks(how=how) + index, indexi = self._get_index(index=index) results = [] for b in blocks: try: @@ -532,17 +708,17 @@ def _apply(self, func, name=None, window=None, center=None, # if we have a string function name, wrap it if isinstance(func, compat.string_types): - if not hasattr(algos, func): + cfunc = getattr(_window, func, None) + if cfunc is None: raise ValueError("we do not support this function " - "algos.{0}".format(func)) - - cfunc = getattr(algos, func) + "in _window.{0}".format(func)) def func(arg, window, min_periods=None): minp = check_minp(min_periods, window) - # GH #12373: rolling functions error on float32 data - return cfunc(com._ensure_float64(arg), - window, minp, **kwargs) + # ensure we are only rolling on floats + arg = _ensure_float64(arg) + return cfunc(arg, + window, minp, indexi, **kwargs) # calculation function if center: @@ -557,10 +733,11 @@ def calc(x): def calc(x): return func(x, window, min_periods=self.min_periods) - if values.ndim > 1: - result = np.apply_along_axis(calc, self.axis, values) - else: - result = calc(values) + with np.errstate(all='ignore'): + if values.ndim > 1: + result = np.apply_along_axis(calc, self.axis, values) + else: + result = calc(values) if center: result = self._center_window(result, window) @@ -576,15 +753,17 @@ class 
_Rolling_and_Expanding(_Rolling): observations inside provided window.""" def count(self): - obj = self._convert_freq() + + blocks, obj, index = self._create_blocks(how=None) + index, indexi = self._get_index(index=index) + window = self._get_window() window = min(window, len(obj)) if not self.center else window - blocks, obj = self._create_blocks(how=None) results = [] for b in blocks: - if com.needs_i8_conversion(b.values): + if needs_i8_conversion(b.values): result = b.notnull().astype(int) else: try: @@ -614,11 +793,13 @@ def apply(self, func, args=(), kwargs={}): _level = kwargs.pop('_level', None) # noqa window = self._get_window() offset = _offset(window, self.center) + index, indexi = self._get_index() def f(arg, window, min_periods): minp = _use_window(min_periods, window) - return algos.roll_generic(arg, window, minp, offset, func, args, - kwargs) + return _window.roll_generic(arg, window, minp, indexi, + offset, func, args, + kwargs) return self._apply(f, func, args=args, kwargs=kwargs, center=False) @@ -684,10 +865,12 @@ def median(self, how=None, **kwargs): def std(self, ddof=1, *args, **kwargs): nv.validate_window_func('std', args, kwargs) window = self._get_window() + index, indexi = self._get_index() def f(arg, *args, **kwargs): minp = _require_min_periods(1)(self.min_periods, window) - return _zsqrt(algos.roll_var(arg, window, minp, ddof)) + return _zsqrt(_window.roll_var(arg, window, minp, indexi, + ddof)) return self._apply(f, 'std', check_minp=_require_min_periods(1), ddof=ddof, **kwargs) @@ -729,10 +912,12 @@ def kurt(self, **kwargs): def quantile(self, quantile, **kwargs): window = self._get_window() + index, indexi = self._get_index() def f(arg, *args, **kwargs): minp = _use_window(self.min_periods, window) - return algos.roll_quantile(arg, window, minp, quantile) + return _window.roll_quantile(arg, window, minp, indexi, + quantile) return self._apply(f, 'quantile', quantile=quantile, **kwargs) @@ -812,44 +997,66 @@ def _get_corr(a, b): class Rolling(_Rolling_and_Expanding): - """ - Provides rolling window calculcations. - - .. versionadded:: 0.18.0 - Parameters - ---------- - window : int - Size of the moving window. This is the number of observations used for - calculating the statistic. - min_periods : int, default None - Minimum number of observations in window required to have a value - (otherwise result is NA). - freq : string or DateOffset object, optional (default None) (DEPRECATED) - Frequency to conform the data to before computing the statistic. - Specified as a frequency string or DateOffset object. - center : boolean, default False - Set the labels at the center of the window. - axis : int, default 0 + @cache_readonly + def is_datetimelike(self): + return isinstance(self._on, + (ABCDatetimeIndex, + ABCTimedeltaIndex, + ABCPeriodIndex)) + + @cache_readonly + def _on(self): + + if self.on is None: + return self.obj.index + elif (isinstance(self.obj, ABCDataFrame) and + self.on in self.obj.columns): + return pd.Index(self.obj[self.on]) + else: + raise ValueError("invalid on specified as {0}, " + "must be a column (if DataFrame) " + "or None".format(self.on)) - Returns - ------- - a Window sub-classed for the particular operation + def validate(self): + super(Rolling, self).validate() - Notes - ----- - By default, the result is set to the right edge of the window. This can be - changed to the center of the window by setting ``center=True``. 
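As a hedged illustration of the ``on`` keyword resolved by ``_on`` above (the column names here are invented for the example), a DataFrame can be rolled over a monotonic datetime column rather than its index:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'time': pd.date_range('2013-01-01 09:00', periods=5, freq='s'),
                       'B': [0, 1, 2, np.nan, 4]})

    # variable-length 2-second windows are computed against the 'time' column;
    # 'time' must be monotonic and is carried through into the result
    df.rolling('2s', on='time').sum()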
+ # we allow rolling on a datetimelike index + if (self.is_datetimelike and + isinstance(self.window, (compat.string_types, DateOffset))): - The `freq` keyword is used to conform time series data to a specified - frequency by resampling the data. This is done with the default parameters - of :meth:`~pandas.Series.resample` (i.e. using the `mean`). - """ + # must be monotonic for on + if not self._on.is_monotonic: + formatted = self.on or 'index' + raise ValueError("{0} must be " + "monotonic".format(formatted)) - def validate(self): - super(Rolling, self).validate() - if not com.is_integer(self.window): + from pandas.tseries.frequencies import to_offset + try: + freq = to_offset(self.window) + except (TypeError, ValueError): + raise ValueError("passed window {0} in not " + "compat with a datetimelike " + "index".format(self.window)) + + # we don't allow center + if self.center: + raise NotImplementedError("center is not implemented " + "for datetimelike and offset " + "based windows") + + # this will raise ValueError on non-fixed freqs + self.window = freq.nanos + self.win_type = 'freq' + + # min_periods must be an integer + if self.min_periods is None: + self.min_periods = 1 + + elif not is_integer(self.window): raise ValueError("window must be an integer") + elif self.window < 0: + raise ValueError("window must be non-negative") @Substitution(name='rolling') @Appender(SelectionMixin._see_also_template) @@ -863,6 +1070,11 @@ def aggregate(self, arg, *args, **kwargs): @Appender(_doc_template) @Appender(_shared_docs['count']) def count(self): + + # different impl for freq counting + if self.is_freq_type: + return self._apply('roll_count', 'count') + return super(Rolling, self).count() @Substitution(name='rolling') @@ -980,12 +1192,31 @@ class Expanding(_Rolling_and_Expanding): Specified as a frequency string or DateOffset object. center : boolean, default False Set the labels at the center of the window. - axis : int, default 0 + axis : int or string, default 0 Returns ------- a Window sub-classed for the particular operation + Examples + -------- + + >>> df = DataFrame({'B': [0, 1, 2, np.nan, 4]}) + B + 0 0.0 + 1 1.0 + 2 2.0 + 3 NaN + 4 4.0 + + >>> df.expanding(2).sum() + B + 0 NaN + 1 1.0 + 2 3.0 + 3 3.0 + 4 7.0 + Notes ----- By default, the result is set to the right edge of the window. This can be @@ -1192,6 +1423,25 @@ class EWM(_Rolling): ------- a Window sub-classed for the particular operation + Examples + -------- + + >>> df = DataFrame({'B': [0, 1, 2, np.nan, 4]}) + B + 0 0.0 + 1 1.0 + 2 2.0 + 3 NaN + 4 4.0 + + >>> df.ewm(com=0.5).mean() + B + 0 0.000000 + 1 0.750000 + 2 1.615385 + 3 1.615385 + 4 3.670213 + Notes ----- Exactly one of center of mass, span, half-life, and alpha must be provided. 
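To make that note concrete, a small sketch (sample data invented) of the mutual-exclusivity rule for the decay parameters:

    import pandas as pd

    s = pd.Series([0.0, 1.0, 2.0, 3.0, 4.0])

    s.ewm(span=3).mean()           # fine: exactly one of com/span/halflife/alpha is given
    # s.ewm(span=3, alpha=0.5)     # raises ValueError: the decay parameters are mutually exclusive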
@@ -1235,6 +1485,7 @@ def __init__(self, obj, com=None, span=None, halflife=None, alpha=None, self.adjust = adjust self.ignore_na = ignore_na self.axis = axis + self.on = None @property def _constructor(self): @@ -1263,7 +1514,7 @@ def _apply(self, func, how=None, **kwargs): y : type of input argument """ - blocks, obj = self._create_blocks(how=how) + blocks, obj, index = self._create_blocks(how=how) results = [] for b in blocks: try: @@ -1278,11 +1529,10 @@ def _apply(self, func, how=None, **kwargs): # if we have a string function name, wrap it if isinstance(func, compat.string_types): - if not hasattr(algos, func): + cfunc = getattr(_window, func, None) + if cfunc is None: raise ValueError("we do not support this function " - "algos.{0}".format(func)) - - cfunc = getattr(algos, func) + "in _window.{0}".format(func)) def func(arg): return cfunc(arg, self.com, int(self.adjust), @@ -1317,9 +1567,9 @@ def var(self, bias=False, *args, **kwargs): nv.validate_window_func('var', args, kwargs) def f(arg): - return algos.ewmcov(arg, arg, self.com, int(self.adjust), - int(self.ignore_na), int(self.min_periods), - int(bias)) + return _window.ewmcov(arg, arg, self.com, int(self.adjust), + int(self.ignore_na), int(self.min_periods), + int(bias)) return self._apply(f, **kwargs) @@ -1337,9 +1587,9 @@ def cov(self, other=None, pairwise=None, bias=False, **kwargs): def _get_cov(X, Y): X = self._shallow_copy(X) Y = self._shallow_copy(Y) - cov = algos.ewmcov(X._prep_values(), Y._prep_values(), self.com, - int(self.adjust), int(self.ignore_na), - int(self.min_periods), int(bias)) + cov = _window.ewmcov(X._prep_values(), Y._prep_values(), self.com, + int(self.adjust), int(self.ignore_na), + int(self.min_periods), int(bias)) return X._wrap_result(cov) return _flex_binary_moment(self._selected_obj, other._selected_obj, @@ -1361,16 +1611,18 @@ def _get_corr(X, Y): Y = self._shallow_copy(Y) def _cov(x, y): - return algos.ewmcov(x, y, self.com, int(self.adjust), - int(self.ignore_na), int(self.min_periods), - 1) + return _window.ewmcov(x, y, self.com, int(self.adjust), + int(self.ignore_na), + int(self.min_periods), + 1) x_values = X._prep_values() y_values = Y._prep_values() - cov = _cov(x_values, y_values) - x_var = _cov(x_values, x_values) - y_var = _cov(y_values, y_values) - corr = cov / _zsqrt(x_var * y_var) + with np.errstate(all='ignore'): + cov = _cov(x_values, y_values) + x_var = _cov(x_values, x_values) + y_var = _cov(y_values, y_values) + corr = cov / _zsqrt(x_var * y_var) return X._wrap_result(corr) return _flex_binary_moment(self._selected_obj, other._selected_obj, @@ -1480,7 +1732,7 @@ def _get_center_of_mass(com, span, halflife, alpha): def _offset(window, center): - if not com.is_integer(window): + if not is_integer(window): window = len(window) offset = (window - 1) / 2. if center else 0 try: @@ -1507,8 +1759,9 @@ def _use_window(minp, window): def _zsqrt(x): - result = np.sqrt(x) - mask = x < 0 + with np.errstate(all='ignore'): + result = np.sqrt(x) + mask = x < 0 from pandas import DataFrame if isinstance(x, DataFrame): diff --git a/pandas/formats/format.py b/pandas/formats/format.py index 27d8b553013b9..e1f5f97088512 100644 --- a/pandas/formats/format.py +++ b/pandas/formats/format.py @@ -1,12 +1,29 @@ # -*- coding: utf-8 -*- +""" +Internal module for formatting output data in csv, html, +and latex files. This module also applies to display formatting. 
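The ``np.errstate(all='ignore')`` wrappers added throughout this patch (around ``_binop``, ``combine``, ``apply``, the window ``calc`` helpers, and ``_zsqrt`` just above) all follow the same pattern: suppress floating-point warnings locally rather than globally. A self-contained sketch:

    import numpy as np

    x = np.array([4.0, -1.0, 9.0])

    with np.errstate(all='ignore'):
        root = np.sqrt(x)          # the negative entry becomes NaN without a RuntimeWarning
        mask = x < 0

    root[mask] = 0                 # clean up the invalid entries afterwards; root is now [2., 0., 3.]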
+""" + from __future__ import print_function from distutils.version import LooseVersion # pylint: disable=W0141 import sys +from pandas.types.missing import isnull, notnull +from pandas.types.common import (is_categorical_dtype, + is_float_dtype, + is_period_arraylike, + is_integer_dtype, + is_datetimetz, + is_integer, + is_float, + is_numeric_dtype, + is_datetime64_dtype, + is_timedelta64_dtype) +from pandas.types.generic import ABCSparseArray + from pandas.core.base import PandasObject -from pandas.core.common import isnull, notnull, is_numeric_dtype from pandas.core.index import Index, MultiIndex, _ensure_index from pandas import compat from pandas.compat import (StringIO, lzip, range, map, zip, reduce, u, @@ -25,7 +42,6 @@ import itertools import csv -import warnings common_docstring = """ Parameters @@ -53,7 +69,9 @@ Set to False for a DataFrame with a hierarchical index to print every multiindex key at each row, default True index_names : bool, optional - Prints the names of the indexes, default True""" + Prints the names of the indexes, default True + line_width : int, optional + Width to wrap a line in characters, default no wrap""" justify_docstring = """ justify : {'left', 'right'}, default None @@ -190,7 +208,7 @@ def _get_footer(self): # level infos are added to the end and in a new line, like it is done # for Categoricals - if com.is_categorical_dtype(self.tr_series.dtype): + if is_categorical_dtype(self.tr_series.dtype): level_info = self.tr_series._values._repr_categories_info() if footer: footer += "\n" @@ -312,12 +330,12 @@ def should_show_dimensions(self): def _get_formatter(self, i): if isinstance(self.formatters, (list, tuple)): - if com.is_integer(i): + if is_integer(i): return self.formatters[i] else: return None else: - if com.is_integer(i) and i not in self.columns: + if is_integer(i) and i not in self.columns: i = self.columns[i] return self.formatters.get(i, None) @@ -586,8 +604,6 @@ def to_string(self): self._chk_truncate() strcols = self._to_str_columns() text = self.adj.adjoin(1, *strcols) - if not self.index: - text = text.replace('\n ', '\n').strip() self.buf.writelines(text) if self.should_show_dimensions: @@ -617,7 +633,8 @@ def _join_multiline(self, *strcols): st = 0 for i, ed in enumerate(col_bins): row = strcols[st:ed] - row.insert(0, idx) + if self.index: + row.insert(0, idx) if nbins > 1: if ed <= len(strcols) and i < nbins - 1: row.append([' \\'] + [' '] * (nrows - 1)) @@ -652,20 +669,28 @@ def _format_col(self, i): float_format=self.float_format, na_rep=self.na_rep, space=self.col_space, decimal=self.decimal) - def to_html(self, classes=None, notebook=False): + def to_html(self, classes=None, notebook=False, border=None): """ Render a DataFrame to a html table. Parameters ---------- + classes : str or list-like + classes to include in the `class` attribute of the opening + ``
`` tag, in addition to the default "dataframe". notebook : {True, False}, optional, default False Whether the generated HTML is for IPython Notebook. + border : int + A ``border=border`` attribute is included in the opening + ``
`` tag. Default ``pd.options.html.border``. - """ + .. versionadded:: 0.19.0 + """ html_renderer = HTMLFormatter(self, classes=classes, max_rows=self.max_rows, max_cols=self.max_cols, - notebook=notebook) + notebook=notebook, + border=border) if hasattr(self.buf, 'write'): html_renderer.write_result(self.buf) elif isinstance(self.buf, compat.string_types): @@ -708,7 +733,7 @@ def space_format(x, y): fmt_columns = columns.format() dtypes = self.frame.dtypes need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes))) - str_columns = [[' ' + x if not self._get_formatter(i) and + str_columns = [[x if not self._get_formatter(i) and need_leadsp[x] else x] for i, (col, x) in enumerate(zip(columns, fmt_columns))] @@ -891,7 +916,7 @@ class HTMLFormatter(TableFormatter): indent_delta = 2 def __init__(self, formatter, classes=None, max_rows=None, max_cols=None, - notebook=False): + notebook=False, border=None): self.fmt = formatter self.classes = classes @@ -907,6 +932,9 @@ def __init__(self, formatter, classes=None, max_rows=None, max_cols=None, self.is_truncated = (self.max_rows < len(self.fmt.frame) or self.max_cols < len(self.fmt.columns)) self.notebook = notebook + if border is None: + border = get_option('html.border') + self.border = border def write(self, s, indent=0): rs = pprint_thing(s) @@ -982,7 +1010,8 @@ def write_result(self, buf): self.write(''.format(div_style)) - self.write('
' % ' '.join(_classes), + self.write('
' % (self.border, + ' '.join(_classes)), indent) indent += self.indent_delta @@ -1277,29 +1306,41 @@ def _write_hierarchical_rows(self, fmt_values, indent): def _get_level_lengths(levels, sentinel=''): - from itertools import groupby + """For each index in each level the function returns lengths of indexes. - def _make_grouper(): - record = {'count': 0} + Parameters + ---------- + levels : list of lists + List of values on for level. + sentinel : string, optional + Value which states that no new index starts on there. - def grouper(x): - if x != sentinel: - record['count'] += 1 - return record['count'] + Returns + ---------- + Returns list of maps. For each level returns map of indexes (key is index + in row and value is length of index). + """ + if len(levels) == 0: + return [] - return grouper + control = [True for x in levels[0]] result = [] - for lev in levels: - i = 0 - f = _make_grouper() - recs = {} - for key, gpr in groupby(lev, f): - values = list(gpr) - recs[i] = len(values) - i += len(values) + for level in levels: + last_index = 0 + + lengths = {} + for i, key in enumerate(level): + if control[i] and key == sentinel: + pass + else: + control[i] = False + lengths[last_index] = i - last_index + last_index = i + + lengths[last_index] = len(level) - last_index - result.append(recs) + result.append(lengths) return result @@ -1309,15 +1350,10 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, mode='w', nanRep=None, encoding=None, compression=None, quoting=None, line_terminator='\n', - chunksize=None, engine=None, tupleize_cols=False, - quotechar='"', date_format=None, doublequote=True, - escapechar=None, decimal='.'): - - if engine is not None: - warnings.warn("'engine' keyword is deprecated and will be " - "removed in a future version", FutureWarning, - stacklevel=3) - self.engine = engine # remove for 0.18 + chunksize=None, tupleize_cols=False, quotechar='"', + date_format=None, doublequote=True, escapechar=None, + decimal='.'): + self.obj = obj if path_or_buf is None: @@ -1352,11 +1388,6 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', self.date_format = date_format - # GH3457 - if not self.obj.columns.is_unique and engine == 'python': - raise NotImplementedError("columns.is_unique == False not " - "supported with engine='python'") - self.tupleize_cols = tupleize_cols self.has_mi_columns = (isinstance(obj.columns, MultiIndex) and not self.tupleize_cols) @@ -1413,108 +1444,6 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', if not index: self.nlevels = 0 - # original python implem. 
of df.to_csv - # invoked by df.to_csv(engine=python) - def _helper_csv(self, writer, na_rep=None, cols=None, header=True, - index=True, index_label=None, float_format=None, - date_format=None): - if cols is None: - cols = self.columns - - has_aliases = isinstance(header, (tuple, list, np.ndarray, Index)) - if has_aliases or header: - if index: - # should write something for index label - if index_label is not False: - if index_label is None: - if isinstance(self.obj.index, MultiIndex): - index_label = [] - for i, name in enumerate(self.obj.index.names): - if name is None: - name = '' - index_label.append(name) - else: - index_label = self.obj.index.name - if index_label is None: - index_label = [''] - else: - index_label = [index_label] - elif not isinstance(index_label, - (list, tuple, np.ndarray, Index)): - # given a string for a DF with Index - index_label = [index_label] - - encoded_labels = list(index_label) - else: - encoded_labels = [] - - if has_aliases: - if len(header) != len(cols): - raise ValueError(('Writing %d cols but got %d aliases' - % (len(cols), len(header)))) - else: - write_cols = header - else: - write_cols = cols - encoded_cols = list(write_cols) - - writer.writerow(encoded_labels + encoded_cols) - else: - encoded_cols = list(cols) - writer.writerow(encoded_cols) - - if date_format is None: - date_formatter = lambda x: Timestamp(x)._repr_base - else: - - def strftime_with_nulls(x): - x = Timestamp(x) - if notnull(x): - return x.strftime(date_format) - - date_formatter = lambda x: strftime_with_nulls(x) - - data_index = self.obj.index - - if isinstance(self.obj.index, PeriodIndex): - data_index = self.obj.index.to_timestamp() - - if isinstance(data_index, DatetimeIndex) and date_format is not None: - data_index = Index([date_formatter(x) for x in data_index]) - - values = self.obj.copy() - values.index = data_index - values.columns = values.columns.to_native_types( - na_rep=na_rep, float_format=float_format, date_format=date_format, - quoting=self.quoting) - values = values[cols] - - series = {} - for k, v in compat.iteritems(values._series): - series[k] = v._values - - nlevels = getattr(data_index, 'nlevels', 1) - for j, idx in enumerate(data_index): - row_fields = [] - if index: - if nlevels == 1: - row_fields = [idx] - else: # handle MultiIndex - row_fields = list(idx) - for i, col in enumerate(cols): - val = series[col][j] - if lib.checknull(val): - val = na_rep - - if float_format is not None and com.is_float(val): - val = float_format % val - elif isinstance(val, (np.datetime64, Timestamp)): - val = date_formatter(val) - - row_fields.append(val) - - writer.writerow(row_fields) - def save(self): # create the writer & save if hasattr(self.path_or_buf, 'write'): @@ -1538,17 +1467,7 @@ def save(self): else: self.writer = csv.writer(f, **writer_kwargs) - if self.engine == 'python': - # to be removed in 0.13 - self._helper_csv(self.writer, na_rep=self.na_rep, - float_format=self.float_format, - cols=self.cols, header=self.header, - index=self.index, - index_label=self.index_label, - date_format=self.date_format) - - else: - self._save() + self._save() finally: if close: @@ -1603,9 +1522,9 @@ def _save_header(self): if not has_mi_columns: encoded_labels += list(write_cols) - - # write out the mi - if has_mi_columns: + writer.writerow(encoded_labels) + else: + # write out the mi columns = obj.columns # write out the names for each level, then ALL of the values for @@ -1626,12 +1545,12 @@ def _save_header(self): writer.writerow(col_line) - # add blanks for the columns, 
so that we - # have consistent seps - encoded_labels.extend([''] * len(columns)) - - # write out the index label line - writer.writerow(encoded_labels) + # Write out the index line if it's not empty. + # Otherwise, we will print out an extraneous + # blank line between the mi and the data rows. + if encoded_labels and set(encoded_labels) != set(['']): + encoded_labels.extend([''] * len(columns)) + writer.writerow(encoded_labels) def _save(self): @@ -1752,7 +1671,7 @@ def __init__(self, df, na_rep='', float_format=None, cols=None, def _format_value(self, val): if lib.checknull(val): val = self.na_rep - elif com.is_float(val): + elif is_float(val): if lib.isposinf_scalar(val): val = self.inf_rep elif lib.isneginf_scalar(val): @@ -1762,7 +1681,6 @@ def _format_value(self, val): return val def _format_header_mi(self): - if self.columns.nlevels > 1: if not self.index: raise NotImplementedError("Writing to Excel with MultiIndex" @@ -1935,7 +1853,11 @@ def _format_hierarchical_rows(self): for spans, levels, labels in zip(level_lengths, self.df.index.levels, self.df.index.labels): - values = levels.take(labels) + + values = levels.take(labels, + allow_fill=levels._can_hold_na, + fill_value=True) + for i in spans: if spans[i] > 1: yield ExcelCell(self.rowcounter + i, gcolidx, @@ -1974,19 +1896,19 @@ def get_formatted_cells(self): def format_array(values, formatter, float_format=None, na_rep='NaN', digits=None, space=None, justify='right', decimal='.'): - if com.is_categorical_dtype(values): + if is_categorical_dtype(values): fmt_klass = CategoricalArrayFormatter - elif com.is_float_dtype(values.dtype): + elif is_float_dtype(values.dtype): fmt_klass = FloatArrayFormatter - elif com.is_period_arraylike(values): + elif is_period_arraylike(values): fmt_klass = PeriodArrayFormatter - elif com.is_integer_dtype(values.dtype): + elif is_integer_dtype(values.dtype): fmt_klass = IntArrayFormatter - elif com.is_datetimetz(values): + elif is_datetimetz(values): fmt_klass = Datetime64TZFormatter - elif com.is_datetime64_dtype(values.dtype): + elif is_datetime64_dtype(values.dtype): fmt_klass = Datetime64Formatter - elif com.is_timedelta64_dtype(values.dtype): + elif is_timedelta64_dtype(values.dtype): fmt_klass = Timedelta64Formatter else: fmt_klass = GenericArrayFormatter @@ -2055,18 +1977,20 @@ def _format(x): vals = self.values if isinstance(vals, Index): vals = vals._values + elif isinstance(vals, ABCSparseArray): + vals = vals.values - is_float = lib.map_infer(vals, com.is_float) & notnull(vals) - leading_space = is_float.any() + is_float_type = lib.map_infer(vals, is_float) & notnull(vals) + leading_space = is_float_type.any() fmt_values = [] for i, v in enumerate(vals): if not is_float[i] and leading_space: - fmt_values.append(' %s' % _format(v)) + fmt_values.append('%s' % _format(v)) elif is_float[i]: fmt_values.append(float_format(v)) else: - fmt_values.append(' %s' % _format(v)) + fmt_values.append('%s' % _format(v)) return fmt_values @@ -2183,14 +2107,14 @@ def format_values_with(float_format): else: too_long = False - abs_vals = np.abs(self.values) - - # this is pretty arbitrary for now - # large values: more that 8 characters including decimal symbol - # and first digit, hence > 1e6 - has_large_values = (abs_vals > 1e6).any() - has_small_values = ((abs_vals < 10**(-self.digits)) & - (abs_vals > 0)).any() + with np.errstate(invalid='ignore'): + abs_vals = np.abs(self.values) + # this is pretty arbitrary for now + # large values: more that 8 characters including decimal symbol + # and first digit, 
hence > 1e6 + has_large_values = (abs_vals > 1e6).any() + has_small_values = ((abs_vals < 10**(-self.digits)) & + (abs_vals > 0)).any() if has_small_values or (too_long and has_large_values): float_format = '%% .%de' % self.digits @@ -2208,7 +2132,7 @@ def _format_strings(self): class IntArrayFormatter(GenericArrayFormatter): def _format_strings(self): - formatter = self.formatter or (lambda x: '% d' % x) + formatter = self.formatter or (lambda x: '%d' % x) fmt_values = [formatter(x) for x in self.values] return fmt_values @@ -2223,9 +2147,13 @@ def _format_strings(self): """ we by definition have DO NOT have a TZ """ values = self.values + if not isinstance(values, DatetimeIndex): values = DatetimeIndex(values) + if self.formatter is not None and callable(self.formatter): + return [self.formatter(x) for x in values] + fmt_values = format_array_from_datetime( values.asi8.ravel(), format=_get_format_datetime64_from_values(values, @@ -2296,9 +2224,10 @@ def format_percentiles(percentiles): percentiles = np.asarray(percentiles) # It checks for np.NaN as well - if not is_numeric_dtype(percentiles) or not np.all(percentiles >= 0) \ - or not np.all(percentiles <= 1): - raise ValueError("percentiles should all be in the interval [0,1]") + with np.errstate(invalid='ignore'): + if not is_numeric_dtype(percentiles) or not np.all(percentiles >= 0) \ + or not np.all(percentiles <= 1): + raise ValueError("percentiles should all be in the interval [0,1]") percentiles = 100 * percentiles int_idx = (percentiles.astype(int) == percentiles) diff --git a/pandas/formats/printing.py b/pandas/formats/printing.py index a4eaec8d5334b..37bd4b63d6f7a 100644 --- a/pandas/formats/printing.py +++ b/pandas/formats/printing.py @@ -2,9 +2,9 @@ printing tools """ +from pandas.types.inference import is_sequence from pandas import compat from pandas.compat import u -import pandas.core.common as com from pandas.core.config import get_option @@ -213,7 +213,7 @@ def as_escaped_unicode(thing, escape_chars=escape_chars): _nest_lvl < get_option("display.pprint_nest_depth")): result = _pprint_dict(thing, _nest_lvl, quote_strings=True, max_seq_items=max_seq_items) - elif (com.is_sequence(thing) and + elif (is_sequence(thing) and _nest_lvl < get_option("display.pprint_nest_depth")): result = _pprint_seq(thing, _nest_lvl, escape_chars=escape_chars, quote_strings=quote_strings, diff --git a/pandas/formats/style.py b/pandas/formats/style.py index f66ac7485c76e..4d5e72a38bb98 100644 --- a/pandas/formats/style.py +++ b/pandas/formats/style.py @@ -17,9 +17,12 @@ "or `pip install Jinja2`" raise ImportError(msg) +from pandas.types.common import is_float, is_string_like + import numpy as np import pandas as pd -from pandas.compat import lzip, range +from pandas.compat import range +from pandas.core.config import get_option import pandas.core.common as com from pandas.core.indexing import _maybe_numeric_slice, _non_reducing_slice try: @@ -78,6 +81,24 @@ class Styler(object): to automatically render itself. Otherwise call Styler.render to get the genterated HTML. 
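A brief, hedged usage sketch for the ``Styler`` described here (the frame and the CSS property are illustrative only):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.random.randn(4, 2), columns=['A', 'B'])

    styler = df.style.set_properties(**{'background-color': 'yellow'})
    html = styler.render()   # the generated HTML as a string; notebooks render it automatically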
+ CSS classes are attached to the generated HTML + + * Index and Column names include ``index_name`` and ``level`` + where `k` is its level in a MultiIndex + * Index label cells include + + * ``row_heading`` + * ``row`` where `n` is the numeric position of the row + * ``level`` where `k` is the level in a MultiIndex + + * Column label cells include + * ``col_heading`` + * ``col`` where `n` is the numeric position of the column + * ``evel`` where `k` is the level in a MultiIndex + + * Blank cells include ``blank`` + * Data cells include ``data`` + See Also -------- pandas.DataFrame.style @@ -109,7 +130,10 @@ class Styler(object): {% for r in head %} {% for c in r %} - <{{c.type}} class="{{c.class}}">{{c.value}} + {% if c.is_visible != False %} + <{{c.type}} class="{{c.class}}" {{ c.attributes|join(" ") }}> + {{c.value}} + {% endif %} {% endfor %} {% endfor %} @@ -118,8 +142,11 @@ class Styler(object): {% for r in body %} {% for c in r %} - <{{c.type}} id="T_{{uuid}}{{c.id}}" class="{{c.class}}"> + {% if c.is_visible != False %} + <{{c.type}} id="T_{{uuid}}{{c.id}}" + class="{{c.class}}" {{ c.attributes|join(" ") }}> {{ c.display_value }} + {% endif %} {% endfor %} {% endfor %} @@ -133,7 +160,7 @@ def __init__(self, data, precision=None, table_styles=None, uuid=None, self._todo = [] if not isinstance(data, (pd.Series, pd.DataFrame)): - raise TypeError + raise TypeError("``data`` must be a Series or DataFrame") if data.ndim == 1: data = data.to_frame() if not data.index.is_unique or not data.columns.is_unique: @@ -147,13 +174,13 @@ def __init__(self, data, precision=None, table_styles=None, uuid=None, self.table_styles = table_styles self.caption = caption if precision is None: - precision = pd.options.display.precision + precision = get_option('display.precision') self.precision = precision self.table_attributes = table_attributes # display_funcs maps (row, col) -> formatting function def default_display_func(x): - if com.is_float(x): + if is_float(x): return '{:>.{precision}g}'.format(x, precision=self.precision) else: return x @@ -176,10 +203,19 @@ def _translate(self): uuid = self.uuid or str(uuid1()).replace("-", "_") ROW_HEADING_CLASS = "row_heading" COL_HEADING_CLASS = "col_heading" + INDEX_NAME_CLASS = "index_name" + DATA_CLASS = "data" BLANK_CLASS = "blank" BLANK_VALUE = "" + def format_attr(pair): + return "{key}={value}".format(**pair) + + # for sparsifying a MultiIndex + idx_lengths = _get_level_lengths(self.index) + col_lengths = _get_level_lengths(self.columns) + cell_context = dict() n_rlvls = self.data.index.nlevels @@ -187,10 +223,6 @@ def _translate(self): rlabels = self.data.index.tolist() clabels = self.data.columns.tolist() - idx_values = self.data.index.format(sparsify=False, adjoin=False, - names=False) - idx_values = lzip(*idx_values) - if n_rlvls == 1: rlabels = [[x] for x in rlabels] if n_clvls == 1: @@ -201,9 +233,24 @@ def _translate(self): head = [] for r in range(n_clvls): + # Blank for Index columns... row_es = [{"type": "th", "value": BLANK_VALUE, - "class": " ".join([BLANK_CLASS])}] * n_rlvls + "display_value": BLANK_VALUE, + "is_visible": True, + "class": " ".join([BLANK_CLASS])}] * (n_rlvls - 1) + + # ... 
except maybe the last for columns.names + name = self.data.columns.names[r] + cs = [BLANK_CLASS if name is None else INDEX_NAME_CLASS, + "level%s" % r] + name = BLANK_VALUE if name is None else name + row_es.append({"type": "th", + "value": name, + "display_value": name, + "class": " ".join(cs), + "is_visible": True}) + for c in range(len(clabels[0])): cs = [COL_HEADING_CLASS, "level%s" % r, "col%s" % c] cs.extend(cell_context.get( @@ -212,16 +259,23 @@ def _translate(self): row_es.append({"type": "th", "value": value, "display_value": value, - "class": " ".join(cs)}) + "class": " ".join(cs), + "is_visible": _is_visible(c, r, col_lengths), + "attributes": [ + format_attr({"key": "colspan", + "value": col_lengths.get( + (r, c), 1)}) + ]}) head.append(row_es) - if self.data.index.names and self.data.index.names != [None]: + if self.data.index.names and not all(x is None + for x in self.data.index.names): index_header_row = [] for c, name in enumerate(self.data.index.names): - cs = [COL_HEADING_CLASS, - "level%s" % (n_clvls + 1), - "col%s" % c] + cs = [INDEX_NAME_CLASS, + "level%s" % c] + name = '' if name is None else name index_header_row.append({"type": "th", "value": name, "class": " ".join(cs)}) @@ -235,12 +289,17 @@ def _translate(self): body = [] for r, idx in enumerate(self.data.index): - cs = [ROW_HEADING_CLASS, "level%s" % c, "row%s" % r] - cs.extend( - cell_context.get("row_headings", {}).get(r, {}).get(c, [])) + # cs.extend( + # cell_context.get("row_headings", {}).get(r, {}).get(c, [])) row_es = [{"type": "th", + "is_visible": _is_visible(r, c, idx_lengths), + "attributes": [ + format_attr({"key": "rowspan", + "value": idx_lengths.get((c, r), 1)}) + ], "value": rlabels[r][c], - "class": " ".join(cs), + "class": " ".join([ROW_HEADING_CLASS, "level%s" % c, + "row%s" % r]), "display_value": rlabels[r][c]} for c in range(len(rlabels[r]))] @@ -427,11 +486,30 @@ def _compute(self): def _apply(self, func, axis=0, subset=None, **kwargs): subset = slice(None) if subset is None else subset subset = _non_reducing_slice(subset) + data = self.data.loc[subset] if axis is not None: - result = self.data.loc[subset].apply(func, axis=axis, **kwargs) + result = data.apply(func, axis=axis, **kwargs) else: - # like tee - result = func(self.data.loc[subset], **kwargs) + result = func(data, **kwargs) + if not isinstance(result, pd.DataFrame): + raise TypeError( + "Function {!r} must return a DataFrame when " + "passed to `Styler.apply` with axis=None".format(func)) + if not (result.index.equals(data.index) and + result.columns.equals(data.columns)): + msg = ('Result of {!r} must have identical index and columns ' + 'as the input'.format(func)) + raise ValueError(msg) + + result_shape = result.shape + expected_shape = self.data.loc[subset].shape + if result_shape != expected_shape: + msg = ("Function {!r} returned the wrong shape.\n" + "Result has shape: {}\n" + "Expected shape: {}".format(func, + result.shape, + expected_shape)) + raise ValueError(msg) self._update_ctx(result) return self @@ -444,15 +522,19 @@ def apply(self, func, axis=0, subset=None, **kwargs): Parameters ---------- - func: function - axis: int, str or None + func : function + ``func`` should take a Series or DataFrame (depending + on ``axis``), and return an object with the same shape. 
+ Must return a DataFrame with identical index and + column labels when ``axis=None`` + axis : int, str or None apply to each column (``axis=0`` or ``'index'``) or to each row (``axis=1`` or ``'columns'``) or - to the entire DataFrame at once with ``axis=None``. - subset: IndexSlice + to the entire DataFrame at once with ``axis=None`` + subset : IndexSlice a valid indexer to limit ``data`` to *before* applying the function. Consider using a pandas.IndexSlice - kwargs: dict + kwargs : dict pass along to ``func`` Returns @@ -461,9 +543,22 @@ def apply(self, func, axis=0, subset=None, **kwargs): Notes ----- + The output shape of ``func`` should match the input, i.e. if + ``x`` is the input row, column, or table (depending on ``axis``), + then ``func(x.shape) == x.shape`` should be true. + This is similar to ``DataFrame.apply``, except that ``axis=None`` applies the function to the entire DataFrame at once, rather than column-wise or row-wise. + + Examples + -------- + >>> def highlight_max(x): + ... return ['background-color: yellow' if v == x.max() else '' + for v in x] + ... + >>> df = pd.DataFrame(np.random.randn(5, 2)) + >>> df.style.apply(highlight_max) """ self._todo.append((lambda instance: getattr(instance, '_apply'), (func, axis, subset), kwargs)) @@ -488,6 +583,7 @@ def applymap(self, func, subset=None, **kwargs): Parameters ---------- func : function + ``func`` should take a scalar and return a scalar subset : IndexSlice a valid indexer to limit ``data`` to *before* applying the function. Consider using a pandas.IndexSlice @@ -742,6 +838,7 @@ def set_properties(self, subset=None, **kwargs): -------- >>> df = pd.DataFrame(np.random.randn(10, 4)) >>> df.style.set_properties(color="white", align="right") + >>> df.style.set_properties(**{'background-color': 'yellow'}) """ values = ';'.join('{p}: {v}'.format(p=p, v=v) for p, v in kwargs.items()) @@ -854,8 +951,42 @@ def _highlight_extrema(data, color='yellow', max_=True): index=data.index, columns=data.columns) +def _is_visible(idx_row, idx_col, lengths): + """ + Index -> {(idx_row, idx_col): bool}) + """ + return (idx_col, idx_row) in lengths + + +def _get_level_lengths(index): + """ + Given an index, find the level lenght for each element. 
+ + Result is a dictionary of (level, inital_position): span + """ + sentinel = com.sentinel_factory() + levels = index.format(sparsify=sentinel, adjoin=False, names=False) + + if index.nlevels == 1: + return {(0, i): 1 for i, value in enumerate(levels)} + + lengths = {} + + for i, lvl in enumerate(levels): + for j, row in enumerate(lvl): + if not get_option('display.multi_sparse'): + lengths[(i, j)] = 1 + elif row != sentinel: + last_label = j + lengths[(i, last_label)] = 1 + else: + lengths[(i, last_label)] += 1 + + return lengths + + def _maybe_wrap_formatter(formatter): - if com.is_string_like(formatter): + if is_string_like(formatter): return lambda x: formatter.format(x) elif callable(formatter): return formatter diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx index f718c1ab0b8da..3bda3f49cb054 100644 --- a/pandas/hashtable.pyx +++ b/pandas/hashtable.pyx @@ -35,773 +35,9 @@ cdef extern from "Python.h": cdef size_t _INIT_VEC_CAP = 32 -cdef class ObjectVector: - - cdef: - PyObject **data - size_t n, m - ndarray ao - - def __cinit__(self): - self.n = 0 - self.m = _INIT_VEC_CAP - self.ao = np.empty(_INIT_VEC_CAP, dtype=object) - self.data = self.ao.data - - def __len__(self): - return self.n - - cdef inline append(self, object o): - if self.n == self.m: - self.m = max(self.m * 2, _INIT_VEC_CAP) - self.ao.resize(self.m) - self.data = self.ao.data - - Py_INCREF(o) - self.data[self.n] = o - self.n += 1 - - def to_array(self): - self.ao.resize(self.n) - self.m = self.n - return self.ao - -ctypedef struct Int64VectorData: - int64_t *data - size_t n, m - -ctypedef struct Float64VectorData: - float64_t *data - size_t n, m - -ctypedef fused vector_data: - Int64VectorData - Float64VectorData - -ctypedef fused sixty_four_bit_scalar: - int64_t - float64_t - -cdef bint needs_resize(vector_data *data) nogil: - return data.n == data.m - -cdef void append_data(vector_data *data, sixty_four_bit_scalar x) nogil: - - # compile time specilization of the fused types - # as the cross-product is generated, but we cannot assign float->int - # the types that don't pass are pruned - if (vector_data is Int64VectorData and sixty_four_bit_scalar is int64_t) or ( - vector_data is Float64VectorData and sixty_four_bit_scalar is float64_t): - - data.data[data.n] = x - data.n += 1 - -cdef class Int64Vector: - - cdef: - Int64VectorData *data - ndarray ao - - def __cinit__(self): - self.data = PyMem_Malloc(sizeof(Int64VectorData)) - if not self.data: - raise MemoryError() - self.data.n = 0 - self.data.m = _INIT_VEC_CAP - self.ao = np.empty(self.data.m, dtype=np.int64) - self.data.data = self.ao.data - - cdef resize(self): - self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) - self.ao.resize(self.data.m) - self.data.data = self.ao.data - - def __dealloc__(self): - PyMem_Free(self.data) - - def __len__(self): - return self.data.n - - def to_array(self): - self.ao.resize(self.data.n) - self.data.m = self.data.n - return self.ao - - cdef inline void append(self, int64_t x): - - if needs_resize(self.data): - self.resize() - - append_data(self.data, x) - -cdef class Float64Vector: - - cdef: - Float64VectorData *data - ndarray ao - - def __cinit__(self): - self.data = PyMem_Malloc(sizeof(Float64VectorData)) - if not self.data: - raise MemoryError() - self.data.n = 0 - self.data.m = _INIT_VEC_CAP - self.ao = np.empty(self.data.m, dtype=np.float64) - self.data.data = self.ao.data - - cdef resize(self): - self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) - self.ao.resize(self.data.m) - self.data.data = self.ao.data - - def 
__dealloc__(self): - PyMem_Free(self.data) - - def __len__(self): - return self.data.n - - def to_array(self): - self.ao.resize(self.data.n) - self.data.m = self.data.n - return self.ao - - cdef inline void append(self, float64_t x): - - if needs_resize(self.data): - self.resize() - - append_data(self.data, x) - -cdef class HashTable: - pass - -cdef class StringHashTable(HashTable): - cdef kh_str_t *table - - def __cinit__(self, int size_hint=1): - self.table = kh_init_str() - if size_hint is not None: - kh_resize_str(self.table, size_hint) - - def __dealloc__(self): - kh_destroy_str(self.table) - - cpdef get_item(self, object val): - cdef khiter_t k - k = kh_get_str(self.table, util.get_c_string(val)) - if k != self.table.n_buckets: - return self.table.vals[k] - else: - raise KeyError(val) - - def get_iter_test(self, object key, Py_ssize_t iterations): - cdef Py_ssize_t i, val - for i in range(iterations): - k = kh_get_str(self.table, util.get_c_string(key)) - if k != self.table.n_buckets: - val = self.table.vals[k] - - cpdef set_item(self, object key, Py_ssize_t val): - cdef: - khiter_t k - int ret = 0 - char* buf - - buf = util.get_c_string(key) - - k = kh_put_str(self.table, buf, &ret) - self.table.keys[k] = key - if kh_exist_str(self.table, k): - self.table.vals[k] = val - else: - raise KeyError(key) - - def get_indexer(self, ndarray[object] values): - cdef: - Py_ssize_t i, n = len(values) - ndarray[int64_t] labels = np.empty(n, dtype=np.int64) - char *buf - int64_t *resbuf = labels.data - khiter_t k - kh_str_t *table = self.table - - for i in range(n): - buf = util.get_c_string(values[i]) - k = kh_get_str(table, buf) - if k != table.n_buckets: - resbuf[i] = table.vals[k] - else: - resbuf[i] = -1 - return labels - - def unique(self, ndarray[object] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - object val - char *buf - khiter_t k - ObjectVector uniques = ObjectVector() - - for i in range(n): - val = values[i] - buf = util.get_c_string(val) - k = kh_get_str(self.table, buf) - if k == self.table.n_buckets: - kh_put_str(self.table, buf, &ret) - uniques.append(val) - - return uniques.to_array() - - def factorize(self, ndarray[object] values): - cdef: - Py_ssize_t i, n = len(values) - ndarray[int64_t] labels = np.empty(n, dtype=np.int64) - dict reverse = {} - Py_ssize_t idx, count = 0 - int ret = 0 - object val - char *buf - khiter_t k - - for i in range(n): - val = values[i] - buf = util.get_c_string(val) - k = kh_get_str(self.table, buf) - if k != self.table.n_buckets: - idx = self.table.vals[k] - labels[i] = idx - else: - k = kh_put_str(self.table, buf, &ret) - # print 'putting %s, %s' % (val, count) - - self.table.vals[k] = count - reverse[count] = val - labels[i] = count - count += 1 - - return reverse, labels - -cdef class Int64HashTable(HashTable): - - def __cinit__(self, size_hint=1): - self.table = kh_init_int64() - if size_hint is not None: - kh_resize_int64(self.table, size_hint) - - def __len__(self): - return self.table.size - - def __dealloc__(self): - kh_destroy_int64(self.table) - - def __contains__(self, object key): - cdef khiter_t k - k = kh_get_int64(self.table, key) - return k != self.table.n_buckets - - cpdef get_item(self, int64_t val): - cdef khiter_t k - k = kh_get_int64(self.table, val) - if k != self.table.n_buckets: - return self.table.vals[k] - else: - raise KeyError(val) - - def get_iter_test(self, int64_t key, Py_ssize_t iterations): - cdef Py_ssize_t i, val=0 - for i in range(iterations): - k = kh_get_int64(self.table, val) - if k != 
self.table.n_buckets: - val = self.table.vals[k] - - cpdef set_item(self, int64_t key, Py_ssize_t val): - cdef: - khiter_t k - int ret = 0 - - k = kh_put_int64(self.table, key, &ret) - self.table.keys[k] = key - if kh_exist_int64(self.table, k): - self.table.vals[k] = val - else: - raise KeyError(key) - - @cython.boundscheck(False) - def map(self, int64_t[:] keys, int64_t[:] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - int64_t key - khiter_t k - - with nogil: - for i in range(n): - key = keys[i] - k = kh_put_int64(self.table, key, &ret) - self.table.vals[k] = values[i] - - @cython.boundscheck(False) - def map_locations(self, ndarray[int64_t, ndim=1] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - int64_t val - khiter_t k - - with nogil: - for i in range(n): - val = values[i] - k = kh_put_int64(self.table, val, &ret) - self.table.vals[k] = i - - @cython.boundscheck(False) - def lookup(self, int64_t[:] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - int64_t val - khiter_t k - int64_t[:] locs = np.empty(n, dtype=np.int64) - - with nogil: - for i in range(n): - val = values[i] - k = kh_get_int64(self.table, val) - if k != self.table.n_buckets: - locs[i] = self.table.vals[k] - else: - locs[i] = -1 - - return np.asarray(locs) - - def factorize(self, ndarray[object] values): - reverse = {} - labels = self.get_labels(values, reverse, 0, 0) - return reverse, labels - - @cython.boundscheck(False) - def get_labels(self, int64_t[:] values, Int64Vector uniques, - Py_ssize_t count_prior, Py_ssize_t na_sentinel, - bint check_null=True): - cdef: - Py_ssize_t i, n = len(values) - int64_t[:] labels - Py_ssize_t idx, count = count_prior - int ret = 0 - int64_t val - khiter_t k - Int64VectorData *ud - - labels = np.empty(n, dtype=np.int64) - ud = uniques.data - - with nogil: - for i in range(n): - val = values[i] - k = kh_get_int64(self.table, val) - - if check_null and val == iNaT: - labels[i] = na_sentinel - continue - - if k != self.table.n_buckets: - idx = self.table.vals[k] - labels[i] = idx - else: - k = kh_put_int64(self.table, val, &ret) - self.table.vals[k] = count - - if needs_resize(ud): - with gil: - uniques.resize() - append_data(ud, val) - labels[i] = count - count += 1 - - return np.asarray(labels) - - @cython.boundscheck(False) - def get_labels_groupby(self, int64_t[:] values): - cdef: - Py_ssize_t i, n = len(values) - int64_t[:] labels - Py_ssize_t idx, count = 0 - int ret = 0 - int64_t val - khiter_t k - Int64Vector uniques = Int64Vector() - Int64VectorData *ud - - labels = np.empty(n, dtype=np.int64) - ud = uniques.data - - with nogil: - for i in range(n): - val = values[i] - - # specific for groupby - if val < 0: - labels[i] = -1 - continue - - k = kh_get_int64(self.table, val) - if k != self.table.n_buckets: - idx = self.table.vals[k] - labels[i] = idx - else: - k = kh_put_int64(self.table, val, &ret) - self.table.vals[k] = count - - if needs_resize(ud): - with gil: - uniques.resize() - append_data(ud, val) - labels[i] = count - count += 1 - - arr_uniques = uniques.to_array() - - return np.asarray(labels), arr_uniques - - @cython.boundscheck(False) - def unique(self, int64_t[:] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - int64_t val - khiter_t k - Int64Vector uniques = Int64Vector() - Int64VectorData *ud - - ud = uniques.data - - with nogil: - for i in range(n): - val = values[i] - k = kh_get_int64(self.table, val) - if k == self.table.n_buckets: - kh_put_int64(self.table, val, &ret) - - if needs_resize(ud): - with 
gil: - uniques.resize() - append_data(ud, val) - - return uniques.to_array() - - -cdef class Float64HashTable(HashTable): - - def __cinit__(self, size_hint=1): - self.table = kh_init_float64() - if size_hint is not None: - kh_resize_float64(self.table, size_hint) - - def __len__(self): - return self.table.size - - cpdef get_item(self, float64_t val): - cdef khiter_t k - k = kh_get_float64(self.table, val) - if k != self.table.n_buckets: - return self.table.vals[k] - else: - raise KeyError(val) - - cpdef set_item(self, float64_t key, Py_ssize_t val): - cdef: - khiter_t k - int ret = 0 - - k = kh_put_float64(self.table, key, &ret) - self.table.keys[k] = key - if kh_exist_float64(self.table, k): - self.table.vals[k] = val - else: - raise KeyError(key) - - def __dealloc__(self): - kh_destroy_float64(self.table) - - def __contains__(self, object key): - cdef khiter_t k - k = kh_get_float64(self.table, key) - return k != self.table.n_buckets - - def factorize(self, float64_t[:] values): - uniques = Float64Vector() - labels = self.get_labels(values, uniques, 0, -1, 1) - return uniques.to_array(), labels - - @cython.boundscheck(False) - def get_labels(self, float64_t[:] values, - Float64Vector uniques, - Py_ssize_t count_prior, int64_t na_sentinel, - bint check_null=True): - cdef: - Py_ssize_t i, n = len(values) - int64_t[:] labels - Py_ssize_t idx, count = count_prior - int ret = 0 - float64_t val - khiter_t k - Float64VectorData *ud - - labels = np.empty(n, dtype=np.int64) - ud = uniques.data - - with nogil: - for i in range(n): - val = values[i] - - if check_null and val != val: - labels[i] = na_sentinel - continue - - k = kh_get_float64(self.table, val) - if k != self.table.n_buckets: - idx = self.table.vals[k] - labels[i] = idx - else: - k = kh_put_float64(self.table, val, &ret) - self.table.vals[k] = count - - if needs_resize(ud): - with gil: - uniques.resize() - append_data(ud, val) - labels[i] = count - count += 1 - - return np.asarray(labels) - - @cython.boundscheck(False) - def map_locations(self, ndarray[float64_t, ndim=1] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - khiter_t k - - with nogil: - for i in range(n): - k = kh_put_float64(self.table, values[i], &ret) - self.table.vals[k] = i - - @cython.boundscheck(False) - def lookup(self, float64_t[:] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - float64_t val - khiter_t k - int64_t[:] locs = np.empty(n, dtype=np.int64) - - with nogil: - for i in range(n): - val = values[i] - k = kh_get_float64(self.table, val) - if k != self.table.n_buckets: - locs[i] = self.table.vals[k] - else: - locs[i] = -1 - - return np.asarray(locs) - - @cython.boundscheck(False) - def unique(self, float64_t[:] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - float64_t val - khiter_t k - bint seen_na = 0 - Float64Vector uniques = Float64Vector() - Float64VectorData *ud - - ud = uniques.data - - with nogil: - for i in range(n): - val = values[i] - - if val == val: - k = kh_get_float64(self.table, val) - if k == self.table.n_buckets: - kh_put_float64(self.table, val, &ret) - - if needs_resize(ud): - with gil: - uniques.resize() - append_data(ud, val) - - elif not seen_na: - seen_na = 1 - - if needs_resize(ud): - with gil: - uniques.resize() - append_data(ud, NAN) - - return uniques.to_array() - -na_sentinel = object - -cdef class PyObjectHashTable(HashTable): - - def __init__(self, size_hint=1): - self.table = kh_init_pymap() - kh_resize_pymap(self.table, size_hint) - - def __dealloc__(self): - if self.table is 
not NULL: - self.destroy() - - def __len__(self): - return self.table.size - - def __contains__(self, object key): - cdef khiter_t k - hash(key) - if key != key or key is None: - key = na_sentinel - k = kh_get_pymap(self.table, key) - return k != self.table.n_buckets - - def destroy(self): - kh_destroy_pymap(self.table) - self.table = NULL - - cpdef get_item(self, object val): - cdef khiter_t k - if val != val or val is None: - val = na_sentinel - k = kh_get_pymap(self.table, val) - if k != self.table.n_buckets: - return self.table.vals[k] - else: - raise KeyError(val) - - def get_iter_test(self, object key, Py_ssize_t iterations): - cdef Py_ssize_t i, val - if key != key or key is None: - key = na_sentinel - for i in range(iterations): - k = kh_get_pymap(self.table, key) - if k != self.table.n_buckets: - val = self.table.vals[k] - - cpdef set_item(self, object key, Py_ssize_t val): - cdef: - khiter_t k - int ret = 0 - char* buf - - hash(key) - if key != key or key is None: - key = na_sentinel - k = kh_put_pymap(self.table, key, &ret) - # self.table.keys[k] = key - if kh_exist_pymap(self.table, k): - self.table.vals[k] = val - else: - raise KeyError(key) - - def map_locations(self, ndarray[object] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - object val - khiter_t k - - for i in range(n): - val = values[i] - hash(val) - if val != val or val is None: - val = na_sentinel - - k = kh_put_pymap(self.table, val, &ret) - self.table.vals[k] = i - - def lookup(self, ndarray[object] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - object val - khiter_t k - int64_t[:] locs = np.empty(n, dtype=np.int64) - - for i in range(n): - val = values[i] - hash(val) - if val != val or val is None: - val = na_sentinel - - k = kh_get_pymap(self.table, val) - if k != self.table.n_buckets: - locs[i] = self.table.vals[k] - else: - locs[i] = -1 - - return np.asarray(locs) - - def unique(self, ndarray[object] values): - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - object val - khiter_t k - ObjectVector uniques = ObjectVector() - bint seen_na = 0 - - for i in range(n): - val = values[i] - hash(val) - if not _checknan(val): - k = kh_get_pymap(self.table, val) - if k == self.table.n_buckets: - kh_put_pymap(self.table, val, &ret) - uniques.append(val) - elif not seen_na: - seen_na = 1 - uniques.append(nan) - - return uniques.to_array() - - def get_labels(self, ndarray[object] values, ObjectVector uniques, - Py_ssize_t count_prior, int64_t na_sentinel, - bint check_null=True): - cdef: - Py_ssize_t i, n = len(values) - int64_t[:] labels - Py_ssize_t idx, count = count_prior - int ret = 0 - object val - khiter_t k - - labels = np.empty(n, dtype=np.int64) - - for i in range(n): - val = values[i] - hash(val) - - if check_null and val != val or val is None: - labels[i] = na_sentinel - continue - - k = kh_get_pymap(self.table, val) - if k != self.table.n_buckets: - idx = self.table.vals[k] - labels[i] = idx - else: - k = kh_put_pymap(self.table, val, &ret) - self.table.vals[k] = count - uniques.append(val) - labels[i] = count - count += 1 - - return np.asarray(labels) +include "hashtable_class_helper.pxi" +include "hashtable_func_helper.pxi" cdef class Factorizer: cdef public PyObjectHashTable table @@ -828,10 +64,10 @@ cdef class Factorizer: mask = (labels == na_sentinel) # sort on if sort: - if labels.dtype != np.int_: - labels = labels.astype(np.int_) + if labels.dtype != np.intp: + labels = labels.astype(np.intp) sorter = self.uniques.to_array().argsort() - reverse_indexer = 
np.empty(len(sorter), dtype=np.int_) + reverse_indexer = np.empty(len(sorter), dtype=np.intp) reverse_indexer.put(sorter, np.arange(len(sorter))) labels = reverse_indexer.take(labels, mode='clip') labels[mask] = na_sentinel @@ -864,11 +100,11 @@ cdef class Int64Factorizer: # sort on if sort: - if labels.dtype != np.int_: - labels = labels.astype(np.int_) + if labels.dtype != np.intp: + labels = labels.astype(np.intp) sorter = self.uniques.to_array().argsort() - reverse_indexer = np.empty(len(sorter), dtype=np.int_) + reverse_indexer = np.empty(len(sorter), dtype=np.intp) reverse_indexer.put(sorter, np.arange(len(sorter))) labels = reverse_indexer.take(labels) @@ -876,94 +112,9 @@ cdef class Int64Factorizer: self.count = len(self.uniques) return labels -ctypedef fused kh_scalar64: - kh_int64_t - kh_float64_t - -@cython.boundscheck(False) -cdef build_count_table_scalar64(sixty_four_bit_scalar[:] values, - kh_scalar64 *table, bint dropna): - cdef: - khiter_t k - Py_ssize_t i, n = len(values) - sixty_four_bit_scalar val - int ret = 0 - - if sixty_four_bit_scalar is float64_t and kh_scalar64 is kh_float64_t: - with nogil: - kh_resize_float64(table, n) - - for i in range(n): - val = values[i] - if val == val or not dropna: - k = kh_get_float64(table, val) - if k != table.n_buckets: - table.vals[k] += 1 - else: - k = kh_put_float64(table, val, &ret) - table.vals[k] = 1 - elif sixty_four_bit_scalar is int64_t and kh_scalar64 is kh_int64_t: - with nogil: - kh_resize_int64(table, n) - - for i in range(n): - val = values[i] - k = kh_get_int64(table, val) - if k != table.n_buckets: - table.vals[k] += 1 - else: - k = kh_put_int64(table, val, &ret) - table.vals[k] = 1 - else: - raise ValueError("Table type must match scalar type.") - - +@cython.wraparound(False) @cython.boundscheck(False) -cpdef value_count_scalar64(sixty_four_bit_scalar[:] values, bint dropna): - cdef: - Py_ssize_t i - kh_float64_t *ftable - kh_int64_t *itable - sixty_four_bit_scalar[:] result_keys - int64_t[:] result_counts - int k - - i = 0 - - if sixty_four_bit_scalar is float64_t: - ftable = kh_init_float64() - build_count_table_scalar64(values, ftable, dropna) - - result_keys = np.empty(ftable.n_occupied, dtype=np.float64) - result_counts = np.zeros(ftable.n_occupied, dtype=np.int64) - - with nogil: - for k in range(ftable.n_buckets): - if kh_exist_float64(ftable, k): - result_keys[i] = ftable.keys[k] - result_counts[i] = ftable.vals[k] - i += 1 - kh_destroy_float64(ftable) - - elif sixty_four_bit_scalar is int64_t: - itable = kh_init_int64() - build_count_table_scalar64(values, itable, dropna) - - result_keys = np.empty(itable.n_occupied, dtype=np.int64) - result_counts = np.zeros(itable.n_occupied, dtype=np.int64) - - with nogil: - for k in range(itable.n_buckets): - if kh_exist_int64(itable, k): - result_keys[i] = itable.keys[k] - result_counts[i] = itable.vals[k] - i += 1 - kh_destroy_int64(itable) - - return np.asarray(result_keys), np.asarray(result_counts) - - cdef build_count_table_object(ndarray[object] values, ndarray[uint8_t, cast=True] mask, kh_pymap_t *table): @@ -987,6 +138,8 @@ cdef build_count_table_object(ndarray[object] values, table.vals[k] = 1 +@cython.wraparound(False) +@cython.boundscheck(False) cpdef value_count_object(ndarray[object] values, ndarray[uint8_t, cast=True] mask): cdef: @@ -1010,6 +163,8 @@ cpdef value_count_object(ndarray[object] values, return result_keys, result_counts +@cython.wraparound(False) +@cython.boundscheck(False) def mode_object(ndarray[object] values, ndarray[uint8_t, cast=True] 
mask): cdef: int count, max_count = 2 @@ -1037,9 +192,10 @@ def mode_object(ndarray[object] values, ndarray[uint8_t, cast=True] mask): kh_destroy_pymap(table) - return modes[:j+1] + return modes[:j + 1] +@cython.wraparound(False) @cython.boundscheck(False) def mode_int64(int64_t[:] values): cdef: @@ -1051,7 +207,7 @@ def mode_int64(int64_t[:] values): table = kh_init_int64() - build_count_table_scalar64(values, table, 0) + build_count_table_int64(values, table, 0) modes = np.empty(table.n_buckets, dtype=np.int64) @@ -1071,48 +227,49 @@ def mode_int64(int64_t[:] values): kh_destroy_int64(table) - return modes[:j+1] + return modes[:j + 1] + @cython.wraparound(False) @cython.boundscheck(False) -def duplicated_int64(ndarray[int64_t, ndim=1] values, object keep='first'): +def duplicated_object(ndarray[object] values, object keep='first'): cdef: - int ret = 0, k - int64_t value - Py_ssize_t i, n = len(values) - kh_int64_t * table = kh_init_int64() - ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool') - - kh_resize_int64(table, min(n, _SIZE_HINT_LIMIT)) + Py_ssize_t i, n + dict seen = dict() + object row - if keep not in ('last', 'first', False): - raise ValueError('keep must be either "first", "last" or False') + n = len(values) + cdef ndarray[uint8_t] result = np.zeros(n, dtype=np.uint8) if keep == 'last': - with nogil: - for i from n > i >=0: - kh_put_int64(table, values[i], &ret) - out[i] = ret == 0 + for i from n > i >= 0: + row = values[i] + if row in seen: + result[i] = 1 + else: + seen[row] = i + result[i] = 0 elif keep == 'first': - with nogil: - for i from 0 <= i < n: - kh_put_int64(table, values[i], &ret) - out[i] = ret == 0 + for i from 0 <= i < n: + row = values[i] + if row in seen: + result[i] = 1 + else: + seen[row] = i + result[i] = 0 + elif keep is False: + for i from 0 <= i < n: + row = values[i] + if row in seen: + result[i] = 1 + result[seen[row]] = 1 + else: + seen[row] = i + result[i] = 0 else: - with nogil: - for i from 0 <= i < n: - value = values[i] - k = kh_get_int64(table, value) - if k != table.n_buckets: - out[table.vals[k]] = 1 - out[i] = 1 - else: - k = kh_put_int64(table, value, &ret) - table.keys[k] = value - table.vals[k] = i - out[i] = 0 - kh_destroy_int64(table) - return out + raise ValueError('keep must be either "first", "last" or False') + + return result.view(np.bool_) @cython.wraparound(False) @@ -1140,7 +297,7 @@ def unique_label_indices(ndarray[int64_t, ndim=1] labels): if needs_resize(ud): with gil: idx.resize() - append_data(ud, i) + append_data_int64(ud, i) kh_destroy_int64(table) diff --git a/pandas/index.pyx b/pandas/index.pyx index 71717dd2d771b..2935560a05b6b 100644 --- a/pandas/index.pyx +++ b/pandas/index.pyx @@ -54,7 +54,8 @@ cdef inline is_definitely_invalid_key(object val): # we have a _data, means we are a NDFrame return (PySlice_Check(val) or cnp.PyArray_Check(val) - or PyList_Check(val) or hasattr(val,'_data')) + or PyList_Check(val) or hasattr(val, '_data')) + def get_value_at(ndarray arr, object loc): if arr.descr.type_num == NPY_DATETIME: @@ -63,6 +64,7 @@ def get_value_at(ndarray arr, object loc): return Timedelta(util.get_value_at(arr, loc)) return util.get_value_at(arr, loc) + def set_value_at(ndarray arr, object loc, object val): return util.set_value_at(arr, loc, val) @@ -80,7 +82,7 @@ cdef class IndexEngine: cdef: bint unique, monotonic_inc, monotonic_dec - bint initialized, monotonic_check, unique_check + bint initialized, monotonic_check def __init__(self, vgetter, n): self.vgetter = vgetter @@ -91,7 +93,6 @@ 
cdef class IndexEngine: self.monotonic_check = 0 self.unique = 0 - self.unique_check = 0 self.monotonic_inc = 0 self.monotonic_dec = 0 @@ -211,8 +212,8 @@ cdef class IndexEngine: property is_unique: def __get__(self): - if not self.unique_check: - self._do_unique_check() + if not self.initialized: + self.initialize() return self.unique == 1 @@ -246,9 +247,6 @@ cdef class IndexEngine: cdef _get_index_values(self): return self.vgetter() - cdef inline _do_unique_check(self): - self._ensure_mapping_populated() - def _call_monotonic(self, values): raise NotImplementedError @@ -270,7 +268,6 @@ cdef class IndexEngine: if len(self.mapping) == len(values): self.unique = 1 - self.unique_check = 1 self.initialized = 1 @@ -307,7 +304,7 @@ cdef class IndexEngine: else: n_alloc = n - result = np.empty(n_alloc, dtype=np.int64) + result = np.empty(n_alloc, dtype=np.int64) missing = np.empty(n_t, dtype=np.int64) # form the set of the results (like ismember) @@ -316,7 +313,7 @@ cdef class IndexEngine: val = util.get_value_1d(values, i) if val in stargets: if val not in d: - d[val] = [] + d[val] = [] d[val].append(i) for i in range(n_t): @@ -327,20 +324,20 @@ cdef class IndexEngine: if val in d: for j in d[val]: - # realloc if needed - if count >= n_alloc: - n_alloc += 10000 - result = np.resize(result, n_alloc) + # realloc if needed + if count >= n_alloc: + n_alloc += 10000 + result = np.resize(result, n_alloc) - result[count] = j - count += 1 + result[count] = j + count += 1 # value not found else: if count >= n_alloc: - n_alloc += 10000 - result = np.resize(result, n_alloc) + n_alloc += 10000 + result = np.resize(result, n_alloc) result[count] = -1 count += 1 missing[count_missing] = i @@ -484,9 +481,9 @@ cdef Py_ssize_t _bin_search(ndarray values, object val) except -1: return mid + 1 _pad_functions = { - 'object' : algos.pad_object, - 'int64' : algos.pad_int64, - 'float64' : algos.pad_float64 + 'object': algos.pad_object, + 'int64': algos.pad_int64, + 'float64': algos.pad_float64 } _backfill_functions = { @@ -611,7 +608,7 @@ cdef class TimedeltaEngine(DatetimeEngine): cpdef convert_scalar(ndarray arr, object value): if arr.descr.type_num == NPY_DATETIME: - if isinstance(value,np.ndarray): + if isinstance(value, np.ndarray): pass elif isinstance(value, Timestamp): return value.value @@ -620,7 +617,7 @@ cpdef convert_scalar(ndarray arr, object value): else: return Timestamp(value).value elif arr.descr.type_num == NPY_TIMEDELTA: - if isinstance(value,np.ndarray): + if isinstance(value, np.ndarray): pass elif isinstance(value, Timedelta): return value.value @@ -644,7 +641,8 @@ cdef inline _to_i8(object val): return get_datetime64_value(val) elif PyDateTime_Check(val): tzinfo = getattr(val, 'tzinfo', None) - ival = _pydatetime_to_dts(val, &dts) # Save the original date value so we can get the utcoffset from it. + # Save the original date value so we can get the utcoffset from it. 
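The tz-aware branch of _to_i8 in this hunk takes the wall-clock value of the datetime and then subtracts its UTC offset. A minimal pure-Python sketch of that arithmetic, assuming an aware datetime.datetime input (to_utc_nanos is a hypothetical helper for illustration, not pandas API):

from datetime import datetime, timedelta, timezone

# Wall-clock nanoseconds since the epoch, then subtract the utcoffset so the
# stored i8 value is a UTC instant (mirrors ival -= _delta_to_nanoseconds(offset)).
def to_utc_nanos(dt):
    delta = dt.replace(tzinfo=None) - datetime(1970, 1, 1)
    nanos = ((delta.days * 86400 + delta.seconds) * 10**9
             + delta.microseconds * 1000)
    offset = dt.utcoffset()
    if offset is not None:
        nanos -= int(offset.total_seconds()) * 10**9
    return nanos

# 2016-01-01 12:00+02:00 is 10:00 UTC
to_utc_nanos(datetime(2016, 1, 1, 12, tzinfo=timezone(timedelta(hours=2))))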
+ ival = _pydatetime_to_dts(val, &dts) if tzinfo is not None and not _is_utc(tzinfo): offset = tslib._get_utcoffset(tzinfo, val) ival -= tslib._delta_to_nanoseconds(offset) diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 82f16becbd511..d4ca18a6713b5 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -5,6 +5,7 @@ import numpy as np import pandas.tslib as tslib import pandas.lib as lib +import pandas._join as _join import pandas.algos as _algos import pandas.index as _index from pandas.lib import Timestamp, Timedelta, is_datetime_array @@ -12,6 +13,29 @@ from pandas.compat import range, u from pandas.compat.numpy import function as nv from pandas import compat + + +from pandas.types.generic import ABCSeries, ABCMultiIndex, ABCPeriodIndex +from pandas.types.missing import isnull, array_equivalent +from pandas.types.common import (_ensure_int64, _ensure_object, + _ensure_platform_int, + is_integer, + is_float, + is_dtype_equal, + is_object_dtype, + is_categorical_dtype, + is_bool_dtype, + is_integer_dtype, is_float_dtype, + is_datetime64_any_dtype, + is_timedelta64_dtype, + needs_i8_conversion, + is_iterator, is_list_like, + is_scalar) +from pandas.types.cast import _coerce_indexer_dtype +from pandas.core.common import (is_bool_indexer, + _values_from_object, + _asarray_tuplesafe) + from pandas.core.base import (PandasObject, FrozenList, FrozenNDArray, IndexOpsMixin) import pandas.core.base as base @@ -22,15 +46,7 @@ import pandas.core.missing as missing import pandas.core.algorithms as algos from pandas.formats.printing import pprint_thing -from pandas.core.common import (isnull, array_equivalent, - is_object_dtype, is_datetimetz, ABCSeries, - ABCPeriodIndex, ABCMultiIndex, - _values_from_object, is_float, is_integer, - is_iterator, is_categorical_dtype, - _ensure_object, _ensure_int64, is_bool_indexer, - is_list_like, is_bool_dtype, - is_integer_dtype, is_float_dtype, - needs_i8_conversion) +from pandas.core.ops import _comp_method_OBJECT_ARRAY from pandas.core.strings import StringAccessorMixin from pandas.core.config import get_option @@ -44,7 +60,8 @@ _unsortable_types = frozenset(('mixed', 'mixed-integer')) -_index_doc_kwargs = dict(klass='Index', inplace='', duplicated='np.array') +_index_doc_kwargs = dict(klass='Index', inplace='', + unique='Index', duplicated='np.ndarray') _index_shared_docs = dict() @@ -96,10 +113,10 @@ class Index(IndexOpsMixin, StringAccessorMixin, PandasObject): # Cython methods _groupby = _algos.groupby_object _arrmap = _algos.arrmap_object - _left_indexer_unique = _algos.left_join_indexer_unique_object - _left_indexer = _algos.left_join_indexer_object - _inner_indexer = _algos.inner_join_indexer_object - _outer_indexer = _algos.outer_join_indexer_object + _left_indexer_unique = _join.left_join_indexer_unique_object + _left_indexer = _join.left_join_indexer_object + _inner_indexer = _join.inner_join_indexer_object + _outer_indexer = _join.outer_join_indexer_object _box_scalars = False _typ = 'index' @@ -147,16 +164,19 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, # index-like elif isinstance(data, (np.ndarray, Index, ABCSeries)): - if (issubclass(data.dtype.type, np.datetime64) or - is_datetimetz(data)): + if (is_datetime64_any_dtype(data) or + (dtype is not None and is_datetime64_any_dtype(dtype)) or + 'tz' in kwargs): from pandas.tseries.index import DatetimeIndex - result = DatetimeIndex(data, copy=copy, name=name, **kwargs) - if dtype is not None and _o_dtype == dtype: + result = DatetimeIndex(data, 
copy=copy, name=name, + dtype=dtype, **kwargs) + if dtype is not None and is_dtype_equal(_o_dtype, dtype): return Index(result.to_pydatetime(), dtype=_o_dtype) else: return result - elif issubclass(data.dtype.type, np.timedelta64): + elif (is_timedelta64_dtype(data) or + (dtype is not None and is_timedelta64_dtype(dtype))): from pandas.tseries.tdi import TimedeltaIndex result = TimedeltaIndex(data, copy=copy, name=name, **kwargs) if dtype is not None and _o_dtype == dtype: @@ -210,7 +230,8 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, pass # maybe coerce to a sub-class - from pandas.tseries.period import PeriodIndex + from pandas.tseries.period import (PeriodIndex, + IncompatibleFrequency) if isinstance(data, PeriodIndex): return PeriodIndex(data, copy=copy, name=name, **kwargs) if issubclass(data.dtype.type, np.integer): @@ -222,7 +243,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data): subarr = data.astype('object') else: - subarr = com._asarray_tuplesafe(data, dtype=object) + subarr = _asarray_tuplesafe(data, dtype=object) # _asarray_tuplesafe does not always copy underlying data, # so need to make sure that this happens @@ -242,29 +263,32 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, # don't support boolean explicity ATM pass elif inferred != 'string': - if (inferred.startswith('datetime') or - tslib.is_timestamp_array(subarr)): - + if inferred.startswith('datetime'): if (lib.is_datetime_with_singletz_array(subarr) or 'tz' in kwargs): # only when subarr has the same tz from pandas.tseries.index import DatetimeIndex - return DatetimeIndex(subarr, copy=copy, name=name, - **kwargs) + try: + return DatetimeIndex(subarr, copy=copy, + name=name, **kwargs) + except tslib.OutOfBoundsDatetime: + pass - elif (inferred.startswith('timedelta') or - lib.is_timedelta_array(subarr)): + elif inferred.startswith('timedelta'): from pandas.tseries.tdi import TimedeltaIndex return TimedeltaIndex(subarr, copy=copy, name=name, **kwargs) elif inferred == 'period': - return PeriodIndex(subarr, name=name, **kwargs) + try: + return PeriodIndex(subarr, name=name, **kwargs) + except IncompatibleFrequency: + pass return cls._simple_new(subarr, name) elif hasattr(data, '__array__'): return Index(np.asarray(data), dtype=dtype, copy=copy, name=name, **kwargs) - elif data is None or lib.isscalar(data): + elif data is None or is_scalar(data): cls._scalar_data_error(data) else: if (tupleize_cols and isinstance(data, list) and data and @@ -284,7 +308,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, # python2 - MultiIndex fails on mixed types pass # other iterable of some kind - subarr = com._asarray_tuplesafe(data, dtype=object) + subarr = _asarray_tuplesafe(data, dtype=object) return Index(subarr, dtype=dtype, copy=copy, name=name, **kwargs) """ @@ -376,6 +400,33 @@ def _shallow_copy_with_infer(self, values=None, **kwargs): pass return Index(values, **attributes) + def _deepcopy_if_needed(self, orig, copy=False): + """ + .. versionadded:: 0.19.0 + + Make a copy of self if data coincides (in memory) with orig. + Subclasses should override this if self._base is not an ndarray. + + Parameters + ---------- + orig : ndarray + other ndarray to compare self._data against + copy : boolean, default False + when False, do not run any check, just return self + + Returns + ------- + A copy of self if needed, otherwise self : Index + """ + if copy: + # Retrieve the "base objects", i.e. 
the original memory allocations + orig = orig if orig.base is None else orig.base + new = self._data if self._data.base is None else self._data.base + if orig is new: + return self.copy(deep=True) + + return self + def _update_inplace(self, result, **kwargs): # guard when called from IndexOpsMixin raise TypeError("Index can't be updated inplace") @@ -467,7 +518,7 @@ def repeat(self, n, *args, **kwargs): def where(self, cond, other=None): """ - .. versionadded:: 0.18.2 + .. versionadded:: 0.19.0 Return an Index of same shape as self and whose corresponding entries are from self where cond is True and otherwise are from @@ -512,7 +563,7 @@ def _coerce_to_ndarray(cls, data): """ if not isinstance(data, (np.ndarray, Index)): - if data is None or lib.isscalar(data): + if data is None or is_scalar(data): cls._scalar_data_error(data) # other iterable of some kind @@ -786,7 +837,7 @@ def _to_embed(self, keep_tz=False): satisfied, the original data is used to create a new Index or the original Index is returned. - .. versionadded:: 0.18.2 + .. versionadded:: 0.19.0 """ @@ -801,9 +852,14 @@ def _to_safe_for_reshape(self): def to_datetime(self, dayfirst=False): """ + DEPRECATED: use :meth:`pandas.to_datetime` instead. + For an Index containing strings or datetime.datetime objects, attempt conversion to DatetimeIndex """ + warnings.warn("to_datetime is deprecated. Use pd.to_datetime(...)", + FutureWarning, stacklevel=2) + from pandas.tseries.index import DatetimeIndex if self.inferred_type == 'string': from dateutil.parser import parse @@ -814,7 +870,7 @@ def to_datetime(self, dayfirst=False): return DatetimeIndex(self.values) def _assert_can_do_setop(self, other): - if not com.is_list_like(other): + if not is_list_like(other): raise TypeError('Input must be Index or array-like') return True @@ -826,6 +882,16 @@ def _convert_can_do_setop(self, other): result_name = self.name if self.name == other.name else None return other, result_name + def _convert_for_op(self, value): + """ Convert value to be insertable to ndarray """ + return value + + def _assert_can_do_op(self, value): + """ Check value is valid for scalar op """ + if not lib.isscalar(value): + msg = "'value' must be a scalar, passed: {0}" + raise TypeError(msg.format(type(value).__name__)) + @property def nlevels(self): return 1 @@ -917,6 +983,16 @@ def rename(self, name, inplace=False): """ return self.set_names([name], inplace=inplace) + def reshape(self, *args, **kwargs): + """ + NOT IMPLEMENTED: do not call this method, as reshaping is not + supported for Index objects and will raise an error. + + Reshape an Index. 
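The base-object comparison in _deepcopy_if_needed above boils down to asking whether two ndarrays are views over the same allocation. A minimal NumPy sketch of that check, assuming plain ndarray inputs (shares_base is an illustrative name, not a pandas helper):

import numpy as np

# Two arrays "coincide in memory" when their base allocations are the same
# object: a view keeps a reference to the original in .base, a copy does not.
def shares_base(a, b):
    a_root = a if a.base is None else a.base
    b_root = b if b.base is None else b.base
    return a_root is b_root

orig = np.arange(5)
assert shares_base(orig, orig[1:])          # a slice is a view -> same base
assert not shares_base(orig, orig.copy())   # a copy owns fresh memory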
+ """ + raise NotImplementedError("reshaping is not supported " + "for Index objects") + @property def _has_complex_internals(self): # to disable groupby tricks in MultiIndex @@ -1181,7 +1257,7 @@ def _constructor(self): @cache_readonly def _engine(self): # property, for now, slow to look up - return self._engine_type(lambda: self.values, len(self)) + return self._engine_type(lambda: self._values, len(self)) def _validate_index_level(self, level): """ @@ -1298,7 +1374,7 @@ def __getitem__(self, key): getitem = self._data.__getitem__ promote = self._shallow_copy - if lib.isscalar(key): + if is_scalar(key): return getitem(key) if isinstance(key, slice): @@ -1311,20 +1387,24 @@ def __getitem__(self, key): key = _values_from_object(key) result = getitem(key) - if not lib.isscalar(result): + if not is_scalar(result): return promote(result) else: return result - def _ensure_compat_append(self, other): + def append(self, other): """ - prepare the append + Append a collection of Index options together + + Parameters + ---------- + other : Index or list/tuple of indices Returns ------- - list of to_concat, name of result Index + appended : Index """ - name = self.name + to_concat = [self] if isinstance(other, (list, tuple)): @@ -1333,46 +1413,29 @@ def _ensure_compat_append(self, other): to_concat.append(other) for obj in to_concat: - if (isinstance(obj, Index) and obj.name != name and - obj.name is not None): - name = None - break - - to_concat = self._ensure_compat_concat(to_concat) - to_concat = [x._values if isinstance(x, Index) else x - for x in to_concat] - return to_concat, name + if not isinstance(obj, Index): + raise TypeError('all inputs must be Index') - def append(self, other): - """ - Append a collection of Index options together - - Parameters - ---------- - other : Index or list/tuple of indices - - Returns - ------- - appended : Index - """ - to_concat, name = self._ensure_compat_append(other) - attribs = self._get_attributes_dict() - attribs['name'] = name - return self._shallow_copy_with_infer( - np.concatenate(to_concat), **attribs) + names = set([obj.name for obj in to_concat]) + name = None if len(names) > 1 else self.name - @staticmethod - def _ensure_compat_concat(indexes): - from pandas.tseries.api import (DatetimeIndex, PeriodIndex, - TimedeltaIndex) - klasses = DatetimeIndex, PeriodIndex, TimedeltaIndex + typs = _concat.get_dtype_kinds(to_concat) - is_ts = [isinstance(idx, klasses) for idx in indexes] + if 'category' in typs: + # if any of the to_concat is category + from pandas.indexes.category import CategoricalIndex + return CategoricalIndex._append_same_dtype(self, to_concat, name) - if any(is_ts) and not all(is_ts): - return [_maybe_box(idx) for idx in indexes] + if len(typs) == 1: + return self._append_same_dtype(to_concat, name=name) + return _concat._concat_index_asobject(to_concat, name=name) - return indexes + def _append_same_dtype(self, to_concat, name): + """ + Concatenate to_concat which has the same class + """ + # must be overrided in specific classes + return _concat._concat_index_asobject(to_concat, name) _index_shared_docs['take'] = """ return a new %(klass)s of the values selected by the indices @@ -1399,7 +1462,7 @@ def _ensure_compat_concat(indexes): def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): nv.validate_take(tuple(), kwargs) - indices = com._ensure_platform_int(indices) + indices = _ensure_platform_int(indices) if self._can_hold_na: taken = self._assert_take_fillable(self.values, indices, allow_fill=allow_fill, @@ 
-1415,7 +1478,7 @@ def take(self, indices, axis=0, allow_fill=True, def _assert_take_fillable(self, values, indices, allow_fill=True, fill_value=None, na_value=np.nan): """ Internal method to handle NA filling of take """ - indices = com._ensure_platform_int(indices) + indices = _ensure_platform_int(indices) # only fill if we are passing a non-None fill_value if allow_fill and fill_value is not None: @@ -1458,16 +1521,6 @@ def hasnans(self): else: return False - def _convert_for_op(self, value): - """ Convert value to be insertable to ndarray """ - return value - - def _assert_can_do_op(self, value): - """ Check value is valid for scalar op """ - if not lib.isscalar(value): - msg = "'value' must be a scalar, passed: {0}" - raise TypeError(msg.format(type(value).__name__)) - def putmask(self, mask, value): """ return a new Index of the values set with the mask @@ -1552,8 +1605,15 @@ def equals(self, other): if not isinstance(other, Index): return False - return array_equivalent(_values_from_object(self), - _values_from_object(other)) + if is_object_dtype(self) and not is_object_dtype(other): + # if other is not object, use other's logic for coercion + return other.equals(self) + + try: + return array_equivalent(_values_from_object(self), + _values_from_object(other)) + except: + return False def identical(self, other): """Similar to equals, but check that other comparable attributes are @@ -1679,28 +1739,16 @@ def argsort(self, *args, **kwargs): return result.argsort(*args, **kwargs) def __add__(self, other): - if com.is_list_like(other): - warnings.warn("using '+' to provide set union with Indexes is " - "deprecated, use '|' or .union()", FutureWarning, - stacklevel=2) - if isinstance(other, Index): - return self.union(other) return Index(np.array(self) + other) def __radd__(self, other): - if is_list_like(other): - warnings.warn("using '+' to provide set union with Indexes is " - "deprecated, use '|' or .union()", FutureWarning, - stacklevel=2) return Index(other + np.array(self)) __iadd__ = __add__ def __sub__(self, other): - warnings.warn("using '-' to provide set differences with Indexes is " - "deprecated, use .difference()", FutureWarning, - stacklevel=2) - return self.difference(other) + raise TypeError("cannot perform __sub__ with this index type: " + "{typ}".format(typ=type(self))) def __and__(self, other): return self.intersection(other) @@ -1723,7 +1771,7 @@ def _get_consensus_name(self, other): else: name = None if self.name != name: - return other._shallow_copy(name=name) + return self._shallow_copy(name=name) return self def union(self, other): @@ -1756,20 +1804,20 @@ def union(self, other): if len(self) == 0: return other._get_consensus_name(self) - if not com.is_dtype_equal(self.dtype, other.dtype): + if not is_dtype_equal(self.dtype, other.dtype): this = self.astype('O') other = other.astype('O') return this.union(other) if self.is_monotonic and other.is_monotonic: try: - result = self._outer_indexer(self.values, other._values)[0] + result = self._outer_indexer(self._values, other._values)[0] except TypeError: # incomparable objects - result = list(self.values) + result = list(self._values) # worth making this faster? 
a very unusual case - value_set = set(self.values) + value_set = set(self._values) result.extend([x for x in other._values if x not in value_set]) else: indexer = self.get_indexer(other) @@ -1778,10 +1826,10 @@ def union(self, other): if len(indexer) > 0: other_diff = algos.take_nd(other._values, indexer, allow_fill=False) - result = _concat._concat_compat((self.values, other_diff)) + result = _concat._concat_compat((self._values, other_diff)) try: - self.values[0] < other_diff[0] + self._values[0] < other_diff[0] except TypeError as e: warnings.warn("%s, sort order is undefined for " "incomparable objects" % e, RuntimeWarning, @@ -1793,7 +1841,7 @@ def union(self, other): result.sort() else: - result = self.values + result = self._values try: result = np.sort(result) @@ -1839,24 +1887,24 @@ def intersection(self, other): if self.equals(other): return self._get_consensus_name(other) - if not com.is_dtype_equal(self.dtype, other.dtype): + if not is_dtype_equal(self.dtype, other.dtype): this = self.astype('O') other = other.astype('O') return this.intersection(other) if self.is_monotonic and other.is_monotonic: try: - result = self._inner_indexer(self.values, other._values)[0] + result = self._inner_indexer(self._values, other._values)[0] return self._wrap_union_result(other, result) except TypeError: pass try: - indexer = Index(self.values).get_indexer(other._values) + indexer = Index(self._values).get_indexer(other._values) indexer = indexer.take((indexer != -1).nonzero()[0]) except: # duplicates - indexer = Index(self.values).get_indexer_non_unique( + indexer = Index(self._values).get_indexer_non_unique( other._values)[0].unique() indexer = indexer[indexer != -1] @@ -1870,7 +1918,8 @@ def difference(self, other): Return a new Index with elements from the index that are not in `other`. - This is the sorted set difference of two Index objects. + This is the set difference of two Index objects. + It's sorted if sorting is possible. Parameters ---------- @@ -1896,14 +1945,25 @@ def difference(self, other): other, result_name = self._convert_can_do_setop(other) - theDiff = sorted(set(self) - set(other)) - return Index(theDiff, name=result_name) + this = self._get_unique_index() + + indexer = this.get_indexer(other) + indexer = indexer.take((indexer != -1).nonzero()[0]) + + label_diff = np.setdiff1d(np.arange(this.size), indexer, + assume_unique=True) + the_diff = this.values.take(label_diff) + try: + the_diff = algos.safe_sort(the_diff) + except TypeError: + pass - diff = deprecate('diff', difference) + return this._shallow_copy(the_diff, name=result_name) def symmetric_difference(self, other, result_name=None): """ - Compute the sorted symmetric difference of two Index objects. + Compute the symmetric difference of two Index objects. + It's sorted if sorting is possible. Parameters ---------- @@ -1918,10 +1978,8 @@ def symmetric_difference(self, other, result_name=None): ----- ``symmetric_difference`` contains elements that appear in either ``idx1`` or ``idx2`` but not both. Equivalent to the Index created by - ``(idx1 - idx2) + (idx2 - idx1)`` with duplicates dropped. - - The sorting of a result containing ``NaN`` values is not guaranteed - across Python versions. See GitHub issue #6444. + ``idx1.difference(idx2) | idx2.difference(idx1)`` with duplicates + dropped. Examples -------- @@ -1940,8 +1998,26 @@ def symmetric_difference(self, other, result_name=None): if result_name is None: result_name = result_name_update - the_diff = sorted(set((self.difference(other)). 
- union(other.difference(self)))) + this = self._get_unique_index() + other = other._get_unique_index() + indexer = this.get_indexer(other) + + # {this} minus {other} + common_indexer = indexer.take((indexer != -1).nonzero()[0]) + left_indexer = np.setdiff1d(np.arange(this.size), common_indexer, + assume_unique=True) + left_diff = this.values.take(left_indexer) + + # {other} minus {this} + right_indexer = (indexer == -1).nonzero()[0] + right_diff = other.values.take(right_indexer) + + the_diff = _concat._concat_compat([left_diff, right_diff]) + try: + the_diff = algos.safe_sort(the_diff) + except TypeError: + pass + attribs = self._get_attributes_dict() attribs['name'] = result_name if 'freq' in attribs: @@ -1950,6 +2026,36 @@ def symmetric_difference(self, other, result_name=None): sym_diff = deprecate('sym_diff', symmetric_difference) + def _get_unique_index(self, dropna=False): + """ + Returns an index containing unique values. + + Parameters + ---------- + dropna : bool + If True, NaN values are dropped. + + Returns + ------- + uniques : index + """ + if self.is_unique and not dropna: + return self + + values = self.values + + if not self.is_unique: + values = self.unique() + + if dropna: + try: + if self.hasnans: + values = values[~isnull(values)] + except NotImplementedError: + pass + + return self._shallow_copy(values) + def get_loc(self, key, method=None, tolerance=None): """ Get integer location for requested label @@ -2001,7 +2107,7 @@ def get_value(self, series, key): # if we have something that is Index-like, then # use this, e.g. DatetimeIndex s = getattr(series, '_values', None) - if isinstance(s, Index) and lib.isscalar(key): + if isinstance(s, Index) and is_scalar(key): try: return s[key] except (IndexError, ValueError): @@ -2034,7 +2140,7 @@ def get_value(self, series, key): raise e1 except TypeError: # python 3 - if lib.isscalar(key): # pragma: no cover + if is_scalar(key): # pragma: no cover raise IndexError(key) raise InvalidIndexError(key) @@ -2110,7 +2216,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): return pself.get_indexer(ptarget, method=method, limit=limit, tolerance=tolerance) - if not com.is_dtype_equal(self.dtype, target.dtype): + if not is_dtype_equal(self.dtype, target.dtype): this = self.astype(object) target = target.astype(object) return this.get_indexer(target, method=method, limit=limit, @@ -2134,7 +2240,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): indexer = self._engine.get_indexer(target._values) - return com._ensure_platform_int(indexer) + return _ensure_platform_int(indexer) def _convert_tolerance(self, tolerance): # override this method on subclasses @@ -2416,10 +2522,10 @@ def _reindex_non_unique(self, target): if len(missing): l = np.arange(len(indexer)) - missing = com._ensure_platform_int(missing) + missing = _ensure_platform_int(missing) missing_labels = target.take(missing) missing_indexer = _ensure_int64(l[~check]) - cur_labels = self.take(indexer[check])._values + cur_labels = self.take(indexer[check]).values cur_indexer = _ensure_int64(l[check]) new_labels = np.empty(tuple([len(indexer)]), dtype=object) @@ -2439,7 +2545,7 @@ def _reindex_non_unique(self, target): else: # need to retake to have the same size as the indexer - indexer = indexer._values + indexer = indexer.values indexer[~check] = 0 # reset the new indexer to account for the new size @@ -2514,7 +2620,7 @@ def join(self, other, how='left', level=None, return_indexers=False): result = x, z, y return result - if not 
com.is_dtype_equal(self.dtype, other.dtype): + if not is_dtype_equal(self.dtype, other.dtype): this = self.astype('O') other = other.astype('O') return this.join(other, how=how, return_indexers=return_indexers) @@ -2610,8 +2716,8 @@ def _join_non_unique(self, other, how='left', return_indexers=False): [other._values], how=how, sort=True) - left_idx = com._ensure_platform_int(left_idx) - right_idx = com._ensure_platform_int(right_idx) + left_idx = _ensure_platform_int(left_idx) + right_idx = _ensure_platform_int(right_idx) join_index = self.values.take(left_idx) mask = left_idx == -1 @@ -2703,7 +2809,7 @@ def _get_leaf_sorter(labels): new_levels[level] = new_level if keep_order: # just drop missing values. o.w. keep order - left_indexer = np.arange(len(left)) + left_indexer = np.arange(len(left), dtype=np.intp) mask = new_lev_labels != -1 if not mask.all(): new_labels = [lab[mask] for lab in new_labels] @@ -2746,6 +2852,10 @@ def _get_leaf_sorter(labels): left_indexer, right_indexer = right_indexer, left_indexer if return_indexers: + left_indexer = (None if left_indexer is None + else _ensure_platform_int(left_indexer)) + right_indexer = (None if right_indexer is None + else _ensure_platform_int(right_indexer)) return join_index, left_indexer, right_indexer else: return join_index @@ -2758,7 +2868,7 @@ def _join_monotonic(self, other, how='left', return_indexers=False): else: return ret_index - sv = self.values + sv = self._values ov = other._values if self.is_unique and other.is_unique: @@ -2789,6 +2899,8 @@ def _join_monotonic(self, other, how='left', return_indexers=False): join_index = self._wrap_joined_index(join_index, other) if return_indexers: + lidx = None if lidx is None else _ensure_platform_int(lidx) + ridx = None if ridx is None else _ensure_platform_int(ridx) return join_index, lidx, ridx else: return join_index @@ -2823,9 +2935,9 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None): kind=kind) # return a slice - if not lib.isscalar(start_slice): + if not is_scalar(start_slice): raise AssertionError("Start slice bound is non-scalar") - if not lib.isscalar(end_slice): + if not is_scalar(end_slice): raise AssertionError("End slice bound is non-scalar") return slice(start_slice, end_slice, step) @@ -3062,7 +3174,6 @@ def insert(self, loc, item): """ _self = np.asarray(self) item = self._coerce_scalar_to_index(item)._values - idx = np.concatenate((_self[:loc], item, _self[loc:])) return self._shallow_copy_with_infer(idx) @@ -3090,6 +3201,11 @@ def drop(self, labels, errors='raise'): indexer = indexer[~mask] return self.delete(indexer) + @Appender(base._shared_docs['unique'] % _index_doc_kwargs) + def unique(self): + result = super(Index, self).unique() + return self._shallow_copy(result) + @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) @Appender(base._shared_docs['drop_duplicates'] % _index_doc_kwargs) @@ -3131,6 +3247,29 @@ def fillna(self, value=None, downcast=None): return Index(result, name=self.name) return self._shallow_copy() + _index_shared_docs['dropna'] = """ + Return Index without NA/NaN values + + Parameters + ---------- + how : {'any', 'all'}, default 'any' + If the Index is a MultiIndex, drop the value when any or all levels + are NaN. 
+ + Returns + ------- + valid : Index + """ + + @Appender(_index_shared_docs['dropna']) + def dropna(self, how='any'): + if how not in ('any', 'all'): + raise ValueError("invalid how option: {0}".format(how)) + + if self.hasnans: + return self._shallow_copy(self.values[~self._isnan]) + return self._shallow_copy() + def _evaluate_with_timedelta_like(self, other, op, opstr): raise TypeError("can only perform ops with timedelta like values") @@ -3155,8 +3294,14 @@ def _evaluate_compare(self, other): if needs_i8_conversion(self) and needs_i8_conversion(other): return self._evaluate_compare(other, op) - func = getattr(self.values, op) - result = func(np.asarray(other)) + if is_object_dtype(self) and self.nlevels == 1: + # don't pass MultiIndex + with np.errstate(all='ignore'): + result = _comp_method_OBJECT_ARRAY( + op, self.values, other) + else: + with np.errstate(all='ignore'): + result = op(self.values, np.asarray(other)) # technically we could support bool dtyped Index # for now just return the indexing array directly @@ -3169,16 +3314,16 @@ def _evaluate_compare(self, other): return _evaluate_compare - cls.__eq__ = _make_compare('__eq__') - cls.__ne__ = _make_compare('__ne__') - cls.__lt__ = _make_compare('__lt__') - cls.__gt__ = _make_compare('__gt__') - cls.__le__ = _make_compare('__le__') - cls.__ge__ = _make_compare('__ge__') + cls.__eq__ = _make_compare(operator.eq) + cls.__ne__ = _make_compare(operator.ne) + cls.__lt__ = _make_compare(operator.lt) + cls.__gt__ = _make_compare(operator.gt) + cls.__le__ = _make_compare(operator.le) + cls.__ge__ = _make_compare(operator.ge) @classmethod - def _add_numericlike_set_methods_disabled(cls): - """ add in the numeric set-like methods to disable """ + def _add_numeric_methods_add_sub_disabled(cls): + """ add in the numeric add/sub methods to disable """ def _make_invalid_op(name): def invalid_op(self, other=None): @@ -3193,7 +3338,7 @@ def invalid_op(self, other=None): @classmethod def _add_numeric_methods_disabled(cls): - """ add in numeric methods to disable """ + """ add in numeric methods to disable other than add/sub """ def _make_invalid_op(name): def invalid_op(self, other=None): @@ -3301,7 +3446,9 @@ def _evaluate_numeric_binop(self, other): attrs = self._get_attributes_dict() attrs = self._maybe_update_attributes(attrs) - return Index(op(values, other), **attrs) + with np.errstate(all='ignore'): + result = op(values, other) + return Index(result, **attrs) return _evaluate_numeric_binop @@ -3453,7 +3600,7 @@ def _get_na_value(dtype): def _ensure_frozen(array_like, categories, copy=False): - array_like = com._coerce_indexer_dtype(array_like, categories) + array_like = _coerce_indexer_dtype(array_like, categories) array_like = array_like.view(FrozenNDArray) if copy: array_like = array_like.copy() @@ -3470,16 +3617,6 @@ def _ensure_has_len(seq): return seq -def _maybe_box(idx): - from pandas.tseries.api import DatetimeIndex, PeriodIndex, TimedeltaIndex - klasses = DatetimeIndex, PeriodIndex, TimedeltaIndex - - if isinstance(idx, klasses): - return idx.asobject - - return idx - - def _trim_front(strings): """ Trims zeros and decimal points diff --git a/pandas/indexes/category.py b/pandas/indexes/category.py index e877e43bcc603..c1f5d47e1e04f 100644 --- a/pandas/indexes/category.py +++ b/pandas/indexes/category.py @@ -1,15 +1,21 @@ import numpy as np -import pandas.lib as lib import pandas.index as _index from pandas import compat from pandas.compat.numpy import function as nv +from pandas.types.generic import ABCCategorical, ABCSeries 
+from pandas.types.common import (is_categorical_dtype, + _ensure_platform_int, + is_list_like, + is_scalar) +from pandas.types.missing import array_equivalent + + from pandas.util.decorators import (Appender, cache_readonly, deprecate_kwarg) from pandas.core.config import get_option from pandas.indexes.base import Index, _index_shared_docs import pandas.core.base as base -import pandas.core.common as com import pandas.core.missing as missing import pandas.indexes.base as ibase @@ -46,7 +52,10 @@ def __new__(cls, data=None, categories=None, ordered=None, dtype=None, if fastpath: return cls._simple_new(data, name=name) - if isinstance(data, com.ABCCategorical): + if name is None and hasattr(data, 'name'): + name = data.name + + if isinstance(data, ABCCategorical): data = cls._create_categorical(cls, data, categories, ordered) elif isinstance(data, CategoricalIndex): data = data._data @@ -55,7 +64,7 @@ def __new__(cls, data=None, categories=None, ordered=None, dtype=None, # don't allow scalars # if data is None, then categories must be provided - if lib.isscalar(data): + if is_scalar(data): if data is not None or categories is None: cls._scalar_data_error(data) data = [] @@ -113,7 +122,8 @@ def _create_categorical(self, data, categories=None, ordered=None): ------- Categorical """ - if not isinstance(data, com.ABCCategorical): + if not isinstance(data, ABCCategorical): + ordered = False if ordered is None else ordered from pandas.core.categorical import Categorical data = Categorical(data, categories=categories, ordered=ordered) else: @@ -161,7 +171,7 @@ def _is_dtype_compat(self, other): ------ TypeError if the dtypes are not compatible """ - if com.is_categorical_dtype(other): + if is_categorical_dtype(other): if isinstance(other, CategoricalIndex): other = other._values if not other.is_dtype_equal(self): @@ -169,7 +179,7 @@ def _is_dtype_compat(self, other): "when appending") else: values = other - if not com.is_list_like(values): + if not is_list_like(values): values = [values] other = CategoricalIndex(self._create_categorical( self, other, categories=self.categories, ordered=self.ordered)) @@ -186,9 +196,12 @@ def equals(self, other): if self.is_(other): return True + if not isinstance(other, Index): + return False + try: other = self._is_dtype_compat(other) - return com.array_equivalent(self._data, other) + return array_equivalent(self._data, other) except (TypeError, ValueError): pass @@ -273,12 +286,21 @@ def _engine(self): def is_unique(self): return not self.duplicated().any() + @Appender(base._shared_docs['unique'] % ibase._index_doc_kwargs) + def unique(self): + result = base.IndexOpsMixin.unique(self) + # CategoricalIndex._shallow_copy uses keeps original categories + # and ordered if not otherwise specified + return self._shallow_copy(result, categories=result.categories, + ordered=result.ordered) + @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) @Appender(base._shared_docs['duplicated'] % ibase._index_doc_kwargs) def duplicated(self, keep='first'): from pandas.hashtable import duplicated_int64 - return duplicated_int64(self.codes.astype('i8'), keep) + codes = self.codes.astype('i8') + return duplicated_int64(codes, keep) def _to_safe_for_reshape(self): """ convert to object if we are a categorical """ @@ -309,7 +331,7 @@ def _can_reindex(self, indexer): def where(self, cond, other=None): """ - .. versionadded:: 0.18.2 + .. 
versionadded:: 0.19.0 Return an Index of same shape as self and whose corresponding entries are from self where cond is True and otherwise are from @@ -356,7 +378,7 @@ def reindex(self, target, method=None, level=None, limit=None, target = ibase._ensure_index(target) - if not com.is_categorical_dtype(target) and not target.is_unique: + if not is_categorical_dtype(target) and not target.is_unique: raise ValueError("cannot reindex with a non-unique indexer") indexer, missing = self.get_indexer_non_unique(np.array(target)) @@ -384,7 +406,7 @@ def reindex(self, target, method=None, level=None, limit=None, # unless we had an inital Categorical to begin with # in which case we are going to conform to the passed Categorical new_target = np.asarray(new_target) - if com.is_categorical_dtype(target): + if is_categorical_dtype(target): new_target = target._shallow_copy(new_target, name=self.name) else: new_target = Index(new_target, name=self.name) @@ -456,7 +478,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): codes = self.categories.get_indexer(target) indexer, _ = self._engine.get_indexer_non_unique(codes) - return com._ensure_platform_int(indexer) + return _ensure_platform_int(indexer) def get_indexer_non_unique(self, target): """ this is the same for a CategoricalIndex for get_indexer; the API @@ -487,7 +509,7 @@ def _convert_list_indexer(self, keyarr, kind=None): def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): nv.validate_take(tuple(), kwargs) - indices = com._ensure_platform_int(indices) + indices = _ensure_platform_int(indices) taken = self._assert_take_fillable(self.codes, indices, allow_fill=allow_fill, fill_value=fill_value, @@ -550,26 +572,17 @@ def insert(self, loc, item): codes = np.concatenate((codes[:loc], code, codes[loc:])) return self._create_from_codes(codes) - def append(self, other): + def _append_same_dtype(self, to_concat, name): """ - Append a collection of CategoricalIndex options together - - Parameters - ---------- - other : Index or list/tuple of indices - - Returns - ------- - appended : Index - - Raises - ------ + Concatenate to_concat which has the same class ValueError if other is not in the categories """ - to_concat, name = self._ensure_compat_append(other) to_concat = [self._is_dtype_compat(c) for c in to_concat] codes = np.concatenate([c.codes for c in to_concat]) - return self._create_from_codes(codes, name=name) + result = self._create_from_codes(codes, name=name) + # if name is None, _create_from_codes sets self.name + result.name = name + return result @classmethod def _add_comparison_methods(cls): @@ -587,12 +600,12 @@ def _evaluate_compare(self, other): self, other._values, categories=self.categories, ordered=self.ordered) - if isinstance(other, (com.ABCCategorical, np.ndarray, - com.ABCSeries)): + if isinstance(other, (ABCCategorical, np.ndarray, + ABCSeries)): if len(self.values) != len(other): raise ValueError("Lengths must match to compare") - if isinstance(other, com.ABCCategorical): + if isinstance(other, ABCCategorical): if not self.values.is_dtype_equal(other): raise TypeError("categorical index comparisions must " "have the same categories and ordered " @@ -615,7 +628,7 @@ def _delegate_method(self, name, *args, **kwargs): if 'inplace' in kwargs: raise ValueError("cannot use inplace with CategoricalIndex") res = method(*args, **kwargs) - if lib.isscalar(res): + if is_scalar(res): return res return CategoricalIndex(res, name=self.name) @@ -636,7 +649,7 @@ def _add_accessors(cls): typ='method', 
overwrite=True) -CategoricalIndex._add_numericlike_set_methods_disabled() +CategoricalIndex._add_numeric_methods_add_sub_disabled() CategoricalIndex._add_numeric_methods_disabled() CategoricalIndex._add_logical_methods_disabled() CategoricalIndex._add_comparison_methods() diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 05b2045a4850f..09c755b2c9792 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -13,6 +13,21 @@ from pandas.compat import range, zip, lrange, lzip, map from pandas.compat.numpy import function as nv from pandas import compat + + +from pandas.types.common import (_ensure_int64, + _ensure_platform_int, + is_object_dtype, + is_iterator, + is_list_like, + is_scalar) +from pandas.types.missing import isnull, array_equivalent +from pandas.core.common import (_values_from_object, + is_bool_indexer, + is_null_slice, + PerformanceWarning) + + from pandas.core.base import FrozenList import pandas.core.base as base from pandas.util.decorators import (Appender, cache_readonly, @@ -21,13 +36,6 @@ import pandas.core.missing as missing import pandas.core.algorithms as algos from pandas.formats.printing import pprint_thing -from pandas.core.common import (isnull, array_equivalent, - is_object_dtype, - _values_from_object, - is_iterator, - _ensure_int64, is_bool_indexer, - is_list_like, is_null_slice, - PerformanceWarning) from pandas.core.config import get_option @@ -589,6 +597,19 @@ def fillna(self, value=None, downcast=None): # isnull is not implemented for MultiIndex raise NotImplementedError('isnull is not defined for MultiIndex') + @Appender(_index_shared_docs['dropna']) + def dropna(self, how='any'): + nans = [label == -1 for label in self.labels] + if how == 'any': + indexer = np.any(nans, axis=0) + elif how == 'all': + indexer = np.all(nans, axis=0) + else: + raise ValueError("invalid how option: {0}".format(how)) + + new_labels = [label[~indexer] for label in self.labels] + return self.copy(labels=new_labels, deep=True) + def get_value(self, series, key): # somewhat broken encapsulation from pandas.core.indexing import maybe_droplevels @@ -666,10 +687,7 @@ def get_level_values(self, level): labels = self.labels[num] filled = algos.take_1d(unique.values, labels, fill_value=unique._na_value) - _simple_new = unique._simple_new - values = _simple_new(filled, name=self.names[num], - freq=getattr(unique, 'freq', None), - tz=getattr(unique, 'tz', None)) + values = unique._shallow_copy(filled) return values def format(self, space=2, sparsify=None, adjoin=True, names=False, @@ -798,7 +816,7 @@ def lexsort_depth(self): else: return 0 - int64_labels = [com._ensure_int64(lab) for lab in self.labels] + int64_labels = [_ensure_int64(lab) for lab in self.labels] for k in range(self.nlevels, 0, -1): if lib.is_lexsorted(int64_labels[:k]): return k @@ -834,15 +852,19 @@ def from_arrays(cls, arrays, sortorder=None, names=None): MultiIndex.from_product : Make a MultiIndex from cartesian product of iterables """ - from pandas.core.categorical import Categorical - if len(arrays) == 1: name = None if names is None else names[0] return Index(arrays[0], name=name) - cats = [Categorical.from_array(arr, ordered=True) for arr in arrays] - levels = [c.categories for c in cats] - labels = [c.codes for c in cats] + # Check if lengths of all arrays are equal or not, + # raise ValueError, if not + for i in range(1, len(arrays)): + if len(arrays[i]) != len(arrays[i - 1]): + raise ValueError('all arrays must be same length') + + from pandas.core.categorical import 
_factorize_from_iterables + + labels, levels = _factorize_from_iterables(arrays) if names is None: names = [getattr(arr, "name", None) for arr in arrays] @@ -928,15 +950,14 @@ def from_product(cls, iterables, sortorder=None, names=None): MultiIndex.from_arrays : Convert list of arrays to MultiIndex MultiIndex.from_tuples : Convert list of tuples to MultiIndex """ - from pandas.core.categorical import Categorical + from pandas.core.categorical import _factorize_from_iterables from pandas.tools.util import cartesian_product - categoricals = [Categorical.from_array(it, ordered=True) - for it in iterables] - labels = cartesian_product([c.codes for c in categoricals]) + labels, levels = _factorize_from_iterables(iterables) + labels = cartesian_product(labels) - return MultiIndex(levels=[c.categories for c in categoricals], - labels=labels, sortorder=sortorder, names=names) + return MultiIndex(levels=levels, labels=labels, sortorder=sortorder, + names=names) @property def nlevels(self): @@ -984,7 +1005,7 @@ def __setstate__(self, state): self._reset_identity() def __getitem__(self, key): - if lib.isscalar(key): + if is_scalar(key): retval = [] for lev, lab in zip(self.levels, self.labels): if lab[key] == -1: @@ -1011,7 +1032,7 @@ def __getitem__(self, key): def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): nv.validate_take(tuple(), kwargs) - indices = com._ensure_platform_int(indices) + indices = _ensure_platform_int(indices) taken = self._assert_take_fillable(self.labels, indices, allow_fill=allow_fill, fill_value=fill_value, @@ -1313,7 +1334,7 @@ def sortlevel(self, level=0, ascending=True, sort_remaining=True): if not ascending: indexer = indexer[::-1] - indexer = com._ensure_platform_int(indexer) + indexer = _ensure_platform_int(indexer) new_labels = [lab.take(indexer) for lab in self.labels] new_index = MultiIndex(labels=new_labels, levels=self.levels, @@ -1377,7 +1398,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): else: indexer = self_index._engine.get_indexer(target._values) - return com._ensure_platform_int(indexer) + return _ensure_platform_int(indexer) def reindex(self, target, method=None, level=None, limit=None, tolerance=None): @@ -1415,6 +1436,7 @@ def reindex(self, target, method=None, level=None, limit=None, return_indexers=True, keep_order=False) else: + target = _ensure_index(target) if self.equals(target): indexer = None else: @@ -1759,7 +1781,7 @@ def convert_indexer(start, stop, step, indexer=indexer, labels=labels): # selected from pandas import Series mapper = Series(indexer) - indexer = labels.take(com._ensure_platform_int(indexer)) + indexer = labels.take(_ensure_platform_int(indexer)) result = Series(Index(indexer).isin(r).nonzero()[0]) m = result.map(mapper)._values @@ -1963,6 +1985,9 @@ def equals(self, other): if self.is_(other): return True + if not isinstance(other, Index): + return False + if not isinstance(other, MultiIndex): return array_equivalent(self._values, _values_from_object(_ensure_index(other))) @@ -2194,6 +2219,7 @@ def isin(self, values, level=None): MultiIndex._add_numeric_methods_disabled() +MultiIndex._add_numeric_methods_add_sub_disabled() MultiIndex._add_logical_methods_disabled() diff --git a/pandas/indexes/numeric.py b/pandas/indexes/numeric.py index 0deaf4da9b2bb..b9625f3aaff92 100644 --- a/pandas/indexes/numeric.py +++ b/pandas/indexes/numeric.py @@ -1,15 +1,18 @@ import numpy as np import pandas.lib as lib +import pandas._join as _join import pandas.algos as _algos import pandas.index 
as _index +from pandas.types.common import (is_dtype_equal, pandas_dtype, + is_float_dtype, is_object_dtype, + is_integer_dtype, is_scalar) +from pandas.types.missing import isnull +from pandas.core.common import _values_from_object + from pandas import compat from pandas.indexes.base import Index, InvalidIndexError, _index_shared_docs from pandas.util.decorators import Appender, cache_readonly -import pandas.core.common as com -from pandas.core.common import (is_dtype_equal, isnull, pandas_dtype, - is_float_dtype, is_object_dtype, - is_integer_dtype) import pandas.indexes.base as ibase @@ -22,6 +25,28 @@ class NumericIndex(Index): """ _is_numeric_dtype = True + def __new__(cls, data=None, dtype=None, copy=False, name=None, + fastpath=False): + + if fastpath: + return cls._simple_new(data, name=name) + + # isscalar, generators handled in coerce_to_ndarray + data = cls._coerce_to_ndarray(data) + + if issubclass(data.dtype.type, compat.string_types): + cls._string_data_error(data) + + if copy or not is_dtype_equal(data.dtype, cls._default_dtype): + subarr = np.array(data, dtype=cls._default_dtype, copy=copy) + cls._assert_safe_casting(data, subarr) + else: + subarr = data + + if name is None and hasattr(data, 'name'): + name = data.name + return cls._simple_new(subarr, name=name) + def _maybe_cast_slice_bound(self, label, side, kind): """ This function should be overloaded in subclasses that allow non-trivial @@ -55,6 +80,15 @@ def _convert_tolerance(self, tolerance): raise ValueError('tolerance argument for %s must be numeric: %r' % (type(self).__name__, tolerance)) + @classmethod + def _assert_safe_casting(cls, data, subarr): + """ + Subclasses need to override this only if the process of casting data + from some accepted dtype to the internal dtype(s) bears the risk of + truncation (e.g. float to int). + """ + pass + class Int64Index(NumericIndex): """ @@ -81,38 +115,16 @@ class Int64Index(NumericIndex): _typ = 'int64index' _groupby = _algos.groupby_int64 _arrmap = _algos.arrmap_int64 - _left_indexer_unique = _algos.left_join_indexer_unique_int64 - _left_indexer = _algos.left_join_indexer_int64 - _inner_indexer = _algos.inner_join_indexer_int64 - _outer_indexer = _algos.outer_join_indexer_int64 + _left_indexer_unique = _join.left_join_indexer_unique_int64 + _left_indexer = _join.left_join_indexer_int64 + _inner_indexer = _join.inner_join_indexer_int64 + _outer_indexer = _join.outer_join_indexer_int64 _can_hold_na = False _engine_type = _index.Int64Engine - def __new__(cls, data=None, dtype=None, copy=False, name=None, - fastpath=False, **kwargs): - - if fastpath: - return cls._simple_new(data, name=name) - - # isscalar, generators handled in coerce_to_ndarray - data = cls._coerce_to_ndarray(data) - - if issubclass(data.dtype.type, compat.string_types): - cls._string_data_error(data) - - elif issubclass(data.dtype.type, np.integer): - dtype = np.int64 - subarr = np.array(data, dtype=dtype, copy=copy) - else: - subarr = np.array(data, dtype=np.int64, copy=copy) - if len(data) > 0: - if (subarr != data).any(): - raise TypeError('Unsafe NumPy casting to integer, you must' - ' explicitly cast') - - return cls._simple_new(subarr, name=name) + _default_dtype = np.int64 @property def inferred_type(self): @@ -148,24 +160,19 @@ def _convert_scalar_indexer(self, key, kind=None): return (super(Int64Index, self) ._convert_scalar_indexer(key, kind=kind)) - def equals(self, other): - """ - Determines if two Index objects contain the same elements. 
- """ - if self.is_(other): - return True - - try: - return com.array_equivalent(com._values_from_object(self), - com._values_from_object(other)) - except TypeError: - # e.g. fails in numpy 1.6 with DatetimeIndex #1681 - return False - def _wrap_joined_index(self, joined, other): name = self.name if self.name == other.name else None return Int64Index(joined, name=name) + @classmethod + def _assert_safe_casting(cls, data, subarr): + """ + Ensure incoming data can be represented as ints. + """ + if not issubclass(data.dtype.type, np.integer): + if not np.array_equal(data, subarr): + raise TypeError('Unsafe NumPy casting, you must ' + 'explicitly cast') Int64Index._add_numeric_methods() Int64Index._add_logical_methods() @@ -195,44 +202,12 @@ class Float64Index(NumericIndex): _engine_type = _index.Float64Engine _groupby = _algos.groupby_float64 _arrmap = _algos.arrmap_float64 - _left_indexer_unique = _algos.left_join_indexer_unique_float64 - _left_indexer = _algos.left_join_indexer_float64 - _inner_indexer = _algos.inner_join_indexer_float64 - _outer_indexer = _algos.outer_join_indexer_float64 - - def __new__(cls, data=None, dtype=None, copy=False, name=None, - fastpath=False, **kwargs): + _left_indexer_unique = _join.left_join_indexer_unique_float64 + _left_indexer = _join.left_join_indexer_float64 + _inner_indexer = _join.inner_join_indexer_float64 + _outer_indexer = _join.outer_join_indexer_float64 - if fastpath: - return cls._simple_new(data, name) - - data = cls._coerce_to_ndarray(data) - - if issubclass(data.dtype.type, compat.string_types): - cls._string_data_error(data) - - if dtype is None: - dtype = np.float64 - dtype = np.dtype(dtype) - - # allow integer / object dtypes to be passed, but coerce to float64 - if dtype.kind in ['i', 'O', 'f']: - dtype = np.float64 - - else: - raise TypeError("cannot support {0} dtype in " - "Float64Index".format(dtype)) - - try: - subarr = np.array(data, dtype=dtype, copy=copy) - except: - raise TypeError('Unsafe NumPy casting, you must explicitly cast') - - # coerce to float64 for storage - if subarr.dtype != np.float64: - subarr = subarr.astype(np.float64) - - return cls._simple_new(subarr, name) + _default_dtype = np.float64 @property def inferred_type(self): @@ -305,22 +280,14 @@ def _format_native_types(self, na_rep='', float_format=None, decimal='.', def get_value(self, series, key): """ we always want to get an index value, never a value """ - if not lib.isscalar(key): + if not is_scalar(key): raise InvalidIndexError - from pandas.core.indexing import maybe_droplevels - from pandas.core.series import Series - - k = com._values_from_object(key) + k = _values_from_object(key) loc = self.get_loc(k) - new_values = com._values_from_object(series)[loc] + new_values = _values_from_object(series)[loc] - if lib.isscalar(new_values) or new_values is None: - return new_values - - new_index = self[loc] - new_index = maybe_droplevels(new_index, k) - return Series(new_values, index=new_index, name=series.name) + return new_values def equals(self, other): """ @@ -329,6 +296,9 @@ def equals(self, other): if self is other: return True + if not isinstance(other, Index): + return False + # need to compare nans locations and make sure that they are the same # since nans don't compare equal this is a bit tricky try: @@ -339,8 +309,7 @@ def equals(self, other): return False left, right = self._values, other._values return ((left == right) | (self._isnan & other._isnan)).all() - except TypeError: - # e.g. 
fails in numpy 1.6 with DatetimeIndex #1681 + except (TypeError, ValueError): return False def __contains__(self, other): @@ -392,6 +361,5 @@ def isin(self, values, level=None): return lib.ismember_nans(np.array(self), value_set, isnull(list(value_set)).any()) - Float64Index._add_numeric_methods() Float64Index._add_logical_methods_disabled() diff --git a/pandas/indexes/range.py b/pandas/indexes/range.py index 168143fdea047..76166e7155bd0 100644 --- a/pandas/indexes/range.py +++ b/pandas/indexes/range.py @@ -4,14 +4,16 @@ import numpy as np import pandas.index as _index +from pandas.types.common import (is_integer, + is_scalar, + is_int64_dtype) + from pandas import compat from pandas.compat import lrange, range from pandas.compat.numpy import function as nv from pandas.indexes.base import Index, _index_shared_docs from pandas.util.decorators import Appender, cache_readonly -import pandas.core.common as com import pandas.indexes.base as ibase -import pandas.lib as lib from pandas.indexes.numeric import Int64Index @@ -56,19 +58,24 @@ def __new__(cls, start=None, stop=None, step=None, name=None, dtype=None, # validate the arguments def _ensure_int(value, field): + msg = ("RangeIndex(...) must be called with integers," + " {value} was passed for {field}") + if not is_scalar(value): + raise TypeError(msg.format(value=type(value).__name__, + field=field)) try: new_value = int(value) assert(new_value == value) - except (ValueError, AssertionError): - raise TypeError("RangeIndex(...) must be called with integers," - " {value} was passed for {field}".format( - value=type(value).__name__, - field=field) - ) + except (TypeError, ValueError, AssertionError): + raise TypeError(msg.format(value=type(value).__name__, + field=field)) return new_value - if start is None: + if start is None and stop is None and step is None: + msg = "RangeIndex(...) 
must be called with integers" + raise TypeError(msg) + elif start is None: start = 0 else: start = _ensure_int(start, 'start') @@ -120,8 +127,13 @@ def _simple_new(cls, start, stop=None, step=None, name=None, result = object.__new__(cls) # handle passed None, non-integers - if start is None or not com.is_integer(start): + if start is None and stop is None: + # empty + start, stop, step = 0, 0, 1 + + if start is None or not is_integer(start): try: + return RangeIndex(start, stop, step, name=name, **kwargs) except TypeError: return Index(start, stop, step, name=name, **kwargs) @@ -139,7 +151,7 @@ def _simple_new(cls, start, stop=None, step=None, name=None, @staticmethod def _validate_dtype(dtype): """ require dtype to be None or int64 """ - if not (dtype is None or com.is_int64_dtype(dtype)): + if not (dtype is None or is_int64_dtype(dtype)): raise TypeError('Invalid to pass a non-int64 dtype to RangeIndex') @cache_readonly @@ -219,6 +231,14 @@ def is_unique(self): """ return if the index has unique values """ return True + @cache_readonly + def is_monotonic_increasing(self): + return self._step > 0 or len(self) <= 1 + + @cache_readonly + def is_monotonic_decreasing(self): + return self._step < 0 or len(self) <= 1 + @property def has_duplicates(self): return False @@ -448,7 +468,7 @@ def __getitem__(self, key): """ super_getitem = super(RangeIndex, self).__getitem__ - if lib.isscalar(key): + if is_scalar(key): n = int(key) if n != key: return super_getitem(key) @@ -510,7 +530,7 @@ def __getitem__(self, key): return super_getitem(key) def __floordiv__(self, other): - if com.is_integer(other): + if is_integer(other): if (len(self) == 0 or self._start % other == 0 and self._step % other == 0): @@ -556,18 +576,20 @@ def _evaluate_numeric_binop(self, other): try: # alppy if we have an override if step: - rstep = step(self._step, other) + with np.errstate(all='ignore'): + rstep = step(self._step, other) # we don't have a representable op # so return a base index - if not com.is_integer(rstep) or not rstep: + if not is_integer(rstep) or not rstep: raise ValueError else: rstep = self._step - rstart = op(self._start, other) - rstop = op(self._stop, other) + with np.errstate(all='ignore'): + rstart = op(self._start, other) + rstop = op(self._stop, other) result = RangeIndex(rstart, rstop, @@ -577,7 +599,7 @@ def _evaluate_numeric_binop(self, other): # for compat with numpy / Int64Index # even if we can represent as a RangeIndex, return # as a Float64Index if we have float-like descriptors - if not all([com.is_integer(x) for x in + if not all([is_integer(x) for x in [rstart, rstop, rstep]]): result = result.astype('float64') @@ -592,7 +614,9 @@ def _evaluate_numeric_binop(self, other): if isinstance(other, RangeIndex): other = other.values - return Index(op(self, other), **attrs) + with np.errstate(all='ignore'): + results = op(self, other) + return Index(results, **attrs) return _evaluate_numeric_binop diff --git a/pandas/io/auth.py b/pandas/io/auth.py index b20b7c8ff1b04..e42df6a7309b7 100644 --- a/pandas/io/auth.py +++ b/pandas/io/auth.py @@ -30,7 +30,8 @@ class AuthenticationConfigError(ValueError): %s -with information from the APIs Console . +with information from the APIs Console +. 
""" DOC_URL = ('https://developers.google.com/api-client-library/python/guide/' diff --git a/pandas/io/common.py b/pandas/io/common.py index cf4bba6e97afb..127ebc4839fd3 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -4,14 +4,23 @@ import os import csv import codecs +import mmap import zipfile from contextlib import contextmanager, closing from pandas.compat import StringIO, BytesIO, string_types, text_type from pandas import compat from pandas.formats.printing import pprint_thing -from pandas.core.common import is_number, AbstractMethodError +from pandas.core.common import AbstractMethodError +from pandas.types.common import is_number +# common NA values +# no longer excluding inf representations +# '1.#INF','-1.#INF', '1.#INF000000', +_NA_VALUES = set([ + '-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', + 'N/A', 'NA', '#NA', 'NULL', 'NaN', '-NaN', 'nan', '-nan', '' +]) try: import pathlib @@ -276,7 +285,7 @@ def ZipFile(*args, **kwargs): ZipFile = zipfile.ZipFile -def _get_handle(path, mode, encoding=None, compression=None): +def _get_handle(path, mode, encoding=None, compression=None, memory_map=False): """Gets file handle for given path and mode. """ if compression is not None: @@ -324,9 +333,57 @@ def _get_handle(path, mode, encoding=None, compression=None): else: f = open(path, mode) + if memory_map and hasattr(f, 'fileno'): + try: + g = MMapWrapper(f) + f.close() + f = g + except Exception: + # we catch any errors that may have occurred + # because that is consistent with the lower-level + # functionality of the C engine (pd.read_csv), so + # leave the file handler as is then + pass + return f +class MMapWrapper(BaseIterator): + """ + Wrapper for the Python's mmap class so that it can be properly read in + by Python's csv.reader class. + + Parameters + ---------- + f : file object + File object to be mapped onto memory. 
Must support the 'fileno' + method or have an equivalent attribute + + """ + + def __init__(self, f): + self.mmap = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) + + def __getattr__(self, name): + return getattr(self.mmap, name) + + def __next__(self): + newline = self.mmap.readline() + + # readline returns bytes, not str, in Python 3, + # but Python's CSV reader expects str, so convert + # the output to str before continuing + if compat.PY3: + newline = compat.bytes_to_str(newline) + + # mmap doesn't raise if reading past the allocated + # data but instead returns an empty string, so raise + # if that is returned + if newline == '': + raise StopIteration + return newline + + class UTF8Recoder(BaseIterator): """ diff --git a/pandas/io/data.py b/pandas/io/data.py index 5fa440e7bb1ff..e76790a6ab98b 100644 --- a/pandas/io/data.py +++ b/pandas/io/data.py @@ -1,1245 +1,6 @@ -""" -Module contains tools for collecting data from various remote sources - - -""" -# flake8: noqa - -import warnings -import tempfile -import datetime as dt -import time - -from collections import defaultdict - -import numpy as np - -from pandas.compat import( - StringIO, bytes_to_str, range, lmap, zip -) -import pandas.compat as compat -from pandas import Panel, DataFrame, Series, read_csv, concat, to_datetime, DatetimeIndex, DateOffset -from pandas.core.common import is_list_like, PandasError -from pandas.io.common import urlopen, ZipFile, urlencode -from pandas.tseries.offsets import MonthEnd -from pandas.util.testing import _network_error_classes -from pandas.io.html import read_html - -warnings.warn("\n" - "The pandas.io.data module is moved to a separate package " - "(pandas-datareader) and will be removed from pandas in a " - "future version.\nAfter installing the pandas-datareader package " - "(https://github.com/pydata/pandas-datareader), you can change " - "the import ``from pandas.io import data, wb`` to " - "``from pandas_datareader import data, wb``.", - FutureWarning) - -class SymbolWarning(UserWarning): - pass - - -class RemoteDataError(PandasError, IOError): - pass - - -def DataReader(name, data_source=None, start=None, end=None, - retry_count=3, pause=0.001): - """ - Imports data from a number of online sources. - - Currently supports Yahoo! Finance, Google Finance, St. Louis FED (FRED) - and Kenneth French's data library. - - Parameters - ---------- - name : str or list of strs - the name of the dataset. Some data sources (yahoo, google, fred) will - accept a list of names. - data_source: str, default: None - the data source ("yahoo", "google", "fred", or "ff") - start : datetime, default: None - left boundary for range (defaults to 1/1/2010) - end : datetime, default: None - right boundary for range (defaults to today) - retry_count : int, default 3 - Number of times to retry query request. - pause : numeric, default 0.001 - Time, in seconds, to pause between consecutive queries of chunks. If - single value given for symbol, represents the pause between retries. - - Examples - ---------- - - # Data from Yahoo! 
Finance - gs = DataReader("GS", "yahoo") - - # Data from Google Finance - aapl = DataReader("AAPL", "google") - - # Data from FRED - vix = DataReader("VIXCLS", "fred") - - # Data from Fama/French - ff = DataReader("F-F_Research_Data_Factors", "famafrench") - ff = DataReader("F-F_Research_Data_Factors_weekly", "famafrench") - ff = DataReader("6_Portfolios_2x3", "famafrench") - ff = DataReader("F-F_ST_Reversal_Factor", "famafrench") - """ - start, end = _sanitize_dates(start, end) - - if data_source == "yahoo": - return get_data_yahoo(symbols=name, start=start, end=end, - adjust_price=False, chunksize=25, - retry_count=retry_count, pause=pause) - elif data_source == "google": - return get_data_google(symbols=name, start=start, end=end, - adjust_price=False, chunksize=25, - retry_count=retry_count, pause=pause) - elif data_source == "fred": - return get_data_fred(name, start, end) - elif data_source == "famafrench": - return get_data_famafrench(name) - - -def _sanitize_dates(start, end): - from pandas.core.datetools import to_datetime - start = to_datetime(start) - end = to_datetime(end) - if start is None: - start = dt.datetime(2010, 1, 1) - if end is None: - end = dt.datetime.today() - return start, end - - -def _in_chunks(seq, size): - """ - Return sequence in 'chunks' of size defined by size - """ - return (seq[pos:pos + size] for pos in range(0, len(seq), size)) - - -_yahoo_codes = {'symbol': 's', 'last': 'l1', 'change_pct': 'p2', 'PE': 'r', - 'time': 't1', 'short_ratio': 's7'} - - -_YAHOO_QUOTE_URL = 'http://finance.yahoo.com/d/quotes.csv?' - - -def get_quote_yahoo(symbols): - """ - Get current yahoo quote - - Returns a DataFrame - """ - if isinstance(symbols, compat.string_types): - sym_list = symbols - else: - sym_list = '+'.join(symbols) - - # for codes see: http://www.gummy-stuff.org/Yahoo-data.htm - request = ''.join(compat.itervalues(_yahoo_codes)) # code request string - header = list(_yahoo_codes.keys()) - - data = defaultdict(list) - - url_str = _YAHOO_QUOTE_URL + 's=%s&f=%s' % (sym_list, request) - - with urlopen(url_str) as url: - lines = url.readlines() - - for line in lines: - fields = line.decode('utf-8').strip().split(',') - for i, field in enumerate(fields): - if field[-2:] == '%"': - v = float(field.strip('"%')) - elif field[0] == '"': - v = field.strip('"') - else: - try: - v = float(field) - except ValueError: - v = field - data[header[i]].append(v) - - idx = data.pop('symbol') - return DataFrame(data, index=idx) - - -def get_quote_google(symbols): - raise NotImplementedError("Google Finance doesn't have this functionality") - - -def _retry_read_url(url, retry_count, pause, name): - for _ in range(retry_count): - time.sleep(pause) - - # kludge to close the socket ASAP - try: - with urlopen(url) as resp: - lines = resp.read() - except _network_error_classes: - pass - else: - rs = read_csv(StringIO(bytes_to_str(lines)), index_col=0, - parse_dates=True, na_values='-')[::-1] - # Yahoo! Finance sometimes does this awesome thing where they - # return 2 rows for the most recent business day - if len(rs) > 2 and rs.index[-1] == rs.index[-2]: # pragma: no cover - rs = rs[:-1] - - #Get rid of unicode characters in index name. - try: - rs.index.name = rs.index.name.decode('unicode_escape').encode('ascii', 'ignore') - except AttributeError: - #Python 3 string has no decode method. 
- rs.index.name = rs.index.name.encode('ascii', 'ignore').decode() - - return rs - - raise IOError("after %d tries, %s did not " - "return a 200 for url %r" % (retry_count, name, url)) - - -_HISTORICAL_YAHOO_URL = 'http://ichart.finance.yahoo.com/table.csv?' - - -def _get_hist_yahoo(sym, start, end, interval, retry_count, pause): - """ - Get historical data for the given name from yahoo. - Date format is datetime - - Returns a DataFrame. - """ - start, end = _sanitize_dates(start, end) - url = (_HISTORICAL_YAHOO_URL + 's=%s' % sym + - '&a=%s' % (start.month - 1) + - '&b=%s' % start.day + - '&c=%s' % start.year + - '&d=%s' % (end.month - 1) + - '&e=%s' % end.day + - '&f=%s' % end.year + - '&g=%s' % interval + - '&ignore=.csv') - return _retry_read_url(url, retry_count, pause, 'Yahoo!') - - -_HISTORICAL_GOOGLE_URL = 'http://www.google.com/finance/historical?' - - -def _get_hist_google(sym, start, end, interval, retry_count, pause): - """ - Get historical data for the given name from google. - Date format is datetime - - Returns a DataFrame. - """ - start, end = _sanitize_dates(start, end) - - # www.google.com/finance/historical?q=GOOG&startdate=Jun+9%2C+2011&enddate=Jun+8%2C+2013&output=csv - url = "%s%s" % (_HISTORICAL_GOOGLE_URL, - urlencode({"q": sym, - "startdate": start.strftime('%b %d, ' '%Y'), - "enddate": end.strftime('%b %d, %Y'), - "output": "csv"})) - return _retry_read_url(url, retry_count, pause, 'Google') - - -def _adjust_prices(hist_data, price_list=None): - """ - Return modifed DataFrame or Panel with adjusted prices based on - 'Adj Close' price. Adds 'Adj_Ratio' column. - """ - if price_list is None: - price_list = 'Open', 'High', 'Low', 'Close' - adj_ratio = hist_data['Adj Close'] / hist_data['Close'] - - data = hist_data.copy() - for item in price_list: - data[item] = hist_data[item] * adj_ratio - data['Adj_Ratio'] = adj_ratio - del data['Adj Close'] - return data - - -def _calc_return_index(price_df): - """ - Return a returns index from a input price df or series. Initial value - (typically NaN) is set to 1. - """ - df = price_df.pct_change().add(1).cumprod() - mask = df.ix[1].notnull() & df.ix[0].isnull() - df.ix[0][mask] = 1 - - # Check for first stock listings after starting date of index in ret_index - # If True, find first_valid_index and set previous entry to 1. - if (~mask).any(): - for sym in mask.index[~mask]: - tstamp = df[sym].first_valid_index() - t_idx = df.index.get_loc(tstamp) - 1 - df[sym].ix[t_idx] = 1 - - return df - - -_YAHOO_COMPONENTS_URL = 'http://download.finance.yahoo.com/d/quotes.csv?' - - -def get_components_yahoo(idx_sym): - """ - Returns DataFrame containing list of component information for - index represented in idx_sym from yahoo. Includes component symbol - (ticker), exchange, and name. 
- - Parameters - ---------- - idx_sym : str - Stock index symbol - Examples: - '^DJI' (Dow Jones Industrial Average) - '^NYA' (NYSE Composite) - '^IXIC' (NASDAQ Composite) - - See: http://finance.yahoo.com/indices for other index symbols - - Returns - ------- - idx_df : DataFrame - """ - stats = 'snx' - # URL of form: - # http://download.finance.yahoo.com/d/quotes.csv?s=@%5EIXIC&f=snxl1d1t1c1ohgv - url = _YAHOO_COMPONENTS_URL + 's={0}&f={1}&e=.csv&h={2}' - - idx_mod = idx_sym.replace('^', '@%5E') - url_str = url.format(idx_mod, stats, 1) - - idx_df = DataFrame() - mask = [True] - comp_idx = 1 - - # LOOP across component index structure, - # break when no new components are found - while True in mask: - url_str = url.format(idx_mod, stats, comp_idx) - with urlopen(url_str) as resp: - raw = resp.read() - lines = raw.decode('utf-8').strip().strip('"').split('"\r\n"') - lines = [line.strip().split('","') for line in lines] - - temp_df = DataFrame(lines, columns=['ticker', 'name', 'exchange']) - temp_df = temp_df.drop_duplicates() - temp_df = temp_df.set_index('ticker') - mask = ~temp_df.index.isin(idx_df.index) - - comp_idx = comp_idx + 50 - idx_df = idx_df.append(temp_df[mask]) - - return idx_df - - -def _dl_mult_symbols(symbols, start, end, interval, chunksize, retry_count, pause, - method): - stocks = {} - failed = [] - passed = [] - for sym_group in _in_chunks(symbols, chunksize): - for sym in sym_group: - try: - stocks[sym] = method(sym, start, end, interval, retry_count, pause) - passed.append(sym) - except IOError: - warnings.warn('Failed to read symbol: {0!r}, replacing with ' - 'NaN.'.format(sym), SymbolWarning) - failed.append(sym) - - if len(passed) == 0: - raise RemoteDataError("No data fetched using " - "{0!r}".format(method.__name__)) - try: - if len(stocks) > 0 and len(failed) > 0 and len(passed) > 0: - df_na = stocks[passed[0]].copy() - df_na[:] = np.nan - for sym in failed: - stocks[sym] = df_na - return Panel(stocks).swapaxes('items', 'minor') - except AttributeError: - # cannot construct a panel with just 1D nans indicating no data - raise RemoteDataError("No data fetched using " - "{0!r}".format(method.__name__)) - -_source_functions = {'google': _get_hist_google, 'yahoo': _get_hist_yahoo} - - -def _get_data_from(symbols, start, end, interval, retry_count, pause, adjust_price, - ret_index, chunksize, source): - - src_fn = _source_functions[source] - - # If a single symbol, (e.g., 'GOOG') - if isinstance(symbols, (compat.string_types, int)): - hist_data = src_fn(symbols, start, end, interval, retry_count, pause) - # Or multiple symbols, (e.g., ['GOOG', 'AAPL', 'MSFT']) - elif isinstance(symbols, DataFrame): - hist_data = _dl_mult_symbols(symbols.index, start, end, interval, chunksize, - retry_count, pause, src_fn) - else: - hist_data = _dl_mult_symbols(symbols, start, end, interval, chunksize, - retry_count, pause, src_fn) - if source.lower() == 'yahoo': - if ret_index: - hist_data['Ret_Index'] = _calc_return_index(hist_data['Adj Close']) - if adjust_price: - hist_data = _adjust_prices(hist_data) - - return hist_data - - -def get_data_yahoo(symbols=None, start=None, end=None, retry_count=3, - pause=0.001, adjust_price=False, ret_index=False, - chunksize=25, interval='d'): - """ - Returns DataFrame/Panel of historical stock prices from symbols, over date - range, start to end. To avoid being penalized by Yahoo! Finance servers, - pauses between downloading 'chunks' of symbols can be specified. 
- - Parameters - ---------- - symbols : string, array-like object (list, tuple, Series), or DataFrame, default: None - Single stock symbol (ticker), array-like object of symbols or - DataFrame with index containing stock symbols - start : string, (defaults to '1/1/2010') - Starting date, timestamp. Parses many different kind of date - representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980') - end : string, (defaults to today) - Ending date, timestamp. Same format as starting date. - retry_count : int, default: 3 - Number of times to retry query request. - pause : numeric, default: 0.001 - Time, in seconds, to pause between consecutive queries of chunks. If - single value given for symbol, represents the pause between retries. - adjust_price : bool, default: False - If True, adjusts all prices in hist_data ('Open', 'High', 'Low', - 'Close') based on 'Adj Close' price. Adds 'Adj_Ratio' column and drops - 'Adj Close'. - ret_index : bool, default: False - If True, includes a simple return index 'Ret_Index' in hist_data. - chunksize : int, default: 25 - Number of symbols to download consecutively before intiating pause. - interval : string, default: 'd' - Time interval code, valid values are 'd' for daily, 'w' for weekly, - 'm' for monthly and 'v' for dividend. - - Returns - ------- - hist_data : DataFrame (str) or Panel (array-like object, DataFrame) - """ - if interval not in ['d', 'w', 'm', 'v']: - raise ValueError("Invalid interval: valid values are 'd', 'w', 'm' and 'v'") - return _get_data_from(symbols, start, end, interval, retry_count, pause, - adjust_price, ret_index, chunksize, 'yahoo') - - -def get_data_google(symbols=None, start=None, end=None, retry_count=3, - pause=0.001, adjust_price=False, ret_index=False, - chunksize=25): - """ - Returns DataFrame/Panel of historical stock prices from symbols, over date - range, start to end. To avoid being penalized by Google Finance servers, - pauses between downloading 'chunks' of symbols can be specified. - - Parameters - ---------- - symbols : string, array-like object (list, tuple, Series), or DataFrame - Single stock symbol (ticker), array-like object of symbols or - DataFrame with index containing stock symbols. - start : string, (defaults to '1/1/2010') - Starting date, timestamp. Parses many different kind of date - representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980') - end : string, (defaults to today) - Ending date, timestamp. Same format as starting date. - retry_count : int, default: 3 - Number of times to retry query request. - pause : numeric, default: 0.001 - Time, in seconds, to pause between consecutive queries of chunks. If - single value given for symbol, represents the pause between retries. - chunksize : int, default: 25 - Number of symbols to download consecutively before intiating pause. - ret_index : bool, default: False - If True, includes a simple return index 'Ret_Index' in hist_data. - - Returns - ------- - hist_data : DataFrame (str) or Panel (array-like object, DataFrame) - """ - return _get_data_from(symbols, start, end, None, retry_count, pause, - adjust_price, ret_index, chunksize, 'google') - - -_FRED_URL = "http://research.stlouisfed.org/fred2/series/" - - -def get_data_fred(name, start=dt.datetime(2010, 1, 1), - end=dt.datetime.today()): - """ - Get data for the given name from the St. Louis FED (FRED). - Date format is datetime - - Returns a DataFrame. - - If multiple names are passed for "series" then the index of the - DataFrame is the outer join of the indicies of each series. 
- """ - start, end = _sanitize_dates(start, end) - - if not is_list_like(name): - names = [name] - else: - names = name - - urls = [_FRED_URL + '%s' % n + '/downloaddata/%s' % n + '.csv' for - n in names] - - def fetch_data(url, name): - with urlopen(url) as resp: - data = read_csv(resp, index_col=0, parse_dates=True, - header=None, skiprows=1, names=["DATE", name], - na_values='.') - try: - return data.truncate(start, end) - except KeyError: - if data.ix[3].name[7:12] == 'Error': - raise IOError("Failed to get the data. Check that {0!r} is " - "a valid FRED series.".format(name)) - raise - df = concat([fetch_data(url, n) for url, n in zip(urls, names)], - axis=1, join='outer') - return df - - -_FAMAFRENCH_URL = 'http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp' - - -def get_data_famafrench(name): - # path of zip files - zip_file_path = '{0}/{1}_TXT.zip'.format(_FAMAFRENCH_URL, name) - - with urlopen(zip_file_path) as url: - raw = url.read() - - with tempfile.TemporaryFile() as tmpf: - tmpf.write(raw) - - with ZipFile(tmpf, 'r') as zf: - data = zf.open(zf.namelist()[0]).readlines() - - line_lengths = np.array(lmap(len, data)) - file_edges = np.where(line_lengths == 2)[0] - - datasets = {} - edges = zip(file_edges + 1, file_edges[1:]) - for i, (left_edge, right_edge) in enumerate(edges): - dataset = [d.split() for d in data[left_edge:right_edge]] - if len(dataset) > 10: - ncol_raw = np.array(lmap(len, dataset)) - ncol = np.median(ncol_raw) - header_index = np.where(ncol_raw == ncol - 1)[0][-1] - header = dataset[header_index] - ds_header = dataset[header_index + 1:] - # to ensure the header is unique - header = ['{0} {1}'.format(j, hj) for j, hj in enumerate(header, - start=1)] - index = np.array([d[0] for d in ds_header], dtype=int) - dataset = np.array([d[1:] for d in ds_header], dtype=float) - datasets[i] = DataFrame(dataset, index, columns=header) - - return datasets - - -# Items needed for options class -CUR_MONTH = dt.datetime.now().month -CUR_YEAR = dt.datetime.now().year -CUR_DAY = dt.datetime.now().day - - -def _two_char(s): - return '{0:0>2}'.format(s) - - -class Options(object): - """ - ***Experimental*** - This class fetches call/put data for a given stock/expiry month. - - It is instantiated with a string representing the ticker symbol. - - The class has the following methods: - get_options_data:(month, year, expiry) - get_call_data:(month, year, expiry) - get_put_data: (month, year, expiry) - get_near_stock_price(opt_frame, above_below) - get_all_data(call, put) - get_forward_data(months, call, put) (deprecated) - - Examples - -------- - # Instantiate object with ticker - >>> aapl = Options('aapl', 'yahoo') - - # Fetch next expiry call data - >>> calls = aapl.get_call_data() - - # Can now access aapl.calls instance variable - >>> aapl.calls - - # Fetch next expiry put data - >>> puts = aapl.get_put_data() - - # Can now access aapl.puts instance variable - >>> aapl.puts - - # cut down the call data to be 3 below and 3 above the stock price. 
- >>> cut_calls = aapl.get_near_stock_price(call=True, above_below=3) - - # Fetch call and put data with expiry from now to 8 months out - >>> forward_data = aapl.get_forward_data(8, call=True, put=True) - - # Fetch all call and put data - >>> all_data = aapl.get_all_data() - """ - - _TABLE_LOC = {'calls': 1, 'puts': 2} - _OPTIONS_BASE_URL = 'http://finance.yahoo.com/q/op?s={sym}' - _FINANCE_BASE_URL = 'http://finance.yahoo.com' - - def __init__(self, symbol, data_source=None): - """ Instantiates options_data with a ticker saved as symbol """ - self.symbol = symbol.upper() - if data_source is None: - warnings.warn("Options(symbol) is deprecated, use Options(symbol," - " data_source) instead", FutureWarning, stacklevel=2) - data_source = "yahoo" - if data_source != "yahoo": - raise NotImplementedError("currently only yahoo supported") - - def get_options_data(self, month=None, year=None, expiry=None): - """ - ***Experimental*** - Gets call/put data for the stock with the expiration data in the - given month and year - - Parameters - ---------- - month : number, int, optional(default=None) - The month the options expire. This should be either 1 or 2 - digits. - - year : number, int, optional(default=None) - The year the options expire. This should be a 4 digit int. - - expiry : date-like or convertible or list-like object, optional (default=None) - The date (or dates) when options expire (defaults to current month) - - Returns - ------- - pandas.DataFrame - A DataFrame with requested options data. - - Index: - Strike: Option strike, int - Expiry: Option expiry, Timestamp - Type: Call or Put, string - Symbol: Option symbol as reported on Yahoo, string - Columns: - Last: Last option price, float - Chg: Change from prior day, float - Bid: Bid price, float - Ask: Ask price, float - Vol: Volume traded, int64 - Open_Int: Open interest, int64 - IsNonstandard: True if the the deliverable is not 100 shares, otherwise false - Underlying: Ticker of the underlying security, string - Underlying_Price: Price of the underlying security, float64 - Quote_Time: Time of the quote, Timestamp - - Notes - ----- - Note: Format of returned data frame is dependent on Yahoo and may change. - - When called, this function will add instance variables named - calls and puts. See the following example: - - >>> aapl = Options('aapl', 'yahoo') # Create object - >>> aapl.calls # will give an AttributeError - >>> aapl.get_options() # Get data and set ivars - >>> aapl.calls # Doesn't throw AttributeError - - Also note that aapl.calls and appl.puts will always be the calls - and puts for the next expiry. If the user calls this method with - a different expiry, the ivar will be named callsYYMMDD or putsYYMMDD, - where YY, MM and DD are, respectively, two digit representations of - the year, month and day for the expiry of the options. 
- - """ - return concat([f(month, year, expiry) - for f in (self.get_put_data, - self.get_call_data)]).sortlevel() - - def _get_option_frames_from_yahoo(self, expiry): - url = self._yahoo_url_from_expiry(expiry) - option_frames = self._option_frames_from_url(url) - frame_name = '_frames' + self._expiry_to_string(expiry) - setattr(self, frame_name, option_frames) - return option_frames - - @staticmethod - def _expiry_to_string(expiry): - m1 = _two_char(expiry.month) - d1 = _two_char(expiry.day) - return str(expiry.year)[-2:] + m1 + d1 - - def _yahoo_url_from_expiry(self, expiry): - try: - expiry_links = self._expiry_links - - except AttributeError: - _, expiry_links = self._get_expiry_dates_and_links() - - return self._FINANCE_BASE_URL + expiry_links[expiry] - - def _option_frames_from_url(self, url): - frames = read_html(url) - nframes = len(frames) - frames_req = max(self._TABLE_LOC.values()) - if nframes < frames_req: - raise RemoteDataError("%s options tables found (%s expected)" % (nframes, frames_req)) - - if not hasattr(self, 'underlying_price'): - try: - self.underlying_price, self.quote_time = self._underlying_price_and_time_from_url(url) - except IndexError: - self.underlying_price, self.quote_time = np.nan, np.nan - - calls = frames[self._TABLE_LOC['calls']] - puts = frames[self._TABLE_LOC['puts']] - - calls = self._process_data(calls, 'call') - puts = self._process_data(puts, 'put') - - return {'calls': calls, 'puts': puts} - - def _underlying_price_and_time_from_url(self, url): - root = self._parse_url(url) - underlying_price = self._underlying_price_from_root(root) - quote_time = self._quote_time_from_root(root) - return underlying_price, quote_time - - @staticmethod - def _underlying_price_from_root(root): - underlying_price = root.xpath('.//*[@class="time_rtq_ticker Fz-30 Fw-b"]')[0]\ - .getchildren()[0].text - underlying_price = underlying_price.replace(',', '') #GH11 - - try: - underlying_price = float(underlying_price) - except ValueError: - underlying_price = np.nan - - return underlying_price - - @staticmethod - def _quote_time_from_root(root): - #Gets the time of the quote, note this is actually the time of the underlying price. - try: - quote_time_text = root.xpath('.//*[@class="time_rtq Fz-m"]')[0].getchildren()[1].getchildren()[0].text - ##TODO: Enable timezone matching when strptime can match EST with %Z - quote_time_text = quote_time_text.split(' ')[0] - quote_time = dt.datetime.strptime(quote_time_text, "%I:%M%p") - quote_time = quote_time.replace(year=CUR_YEAR, month=CUR_MONTH, day=CUR_DAY) - except ValueError: - quote_time = np.nan - - return quote_time - - def _get_option_data(self, expiry, name): - frame_name = '_frames' + self._expiry_to_string(expiry) - - try: - frames = getattr(self, frame_name) - except AttributeError: - frames = self._get_option_frames_from_yahoo(expiry) - - option_data = frames[name] - if expiry != self.expiry_dates[0]: - name += self._expiry_to_string(expiry) - - setattr(self, name, option_data) - return option_data - - def get_call_data(self, month=None, year=None, expiry=None): - """ - ***Experimental*** - Gets call/put data for the stock with the expiration data in the - given month and year - - Parameters - ---------- - month : number, int, optional(default=None) - The month the options expire. This should be either 1 or 2 - digits. - - year : number, int, optional(default=None) - The year the options expire. This should be a 4 digit int. 
- - expiry : date-like or convertible or list-like object, optional (default=None) - The date (or dates) when options expire (defaults to current month) - - Returns - ------- - call_data: pandas.DataFrame - A DataFrame with requested options data. - - Index: - Strike: Option strike, int - Expiry: Option expiry, Timestamp - Type: Call or Put, string - Symbol: Option symbol as reported on Yahoo, string - Columns: - Last: Last option price, float - Chg: Change from prior day, float - Bid: Bid price, float - Ask: Ask price, float - Vol: Volume traded, int64 - Open_Int: Open interest, int64 - IsNonstandard: True if the the deliverable is not 100 shares, otherwise false - Underlying: Ticker of the underlying security, string - Underlying_Price: Price of the underlying security, float64 - Quote_Time: Time of the quote, Timestamp - - Notes - ----- - Note: Format of returned data frame is dependent on Yahoo and may change. - - When called, this function will add instance variables named - calls and puts. See the following example: - - >>> aapl = Options('aapl', 'yahoo') # Create object - >>> aapl.calls # will give an AttributeError - >>> aapl.get_call_data() # Get data and set ivars - >>> aapl.calls # Doesn't throw AttributeError - - Also note that aapl.calls will always be the calls for the next - expiry. If the user calls this method with a different month - or year, the ivar will be named callsYYMMDD where YY, MM and DD are, - respectively, two digit representations of the year, month and day - for the expiry of the options. - """ - expiry = self._try_parse_dates(year, month, expiry) - return self._get_data_in_date_range(expiry, call=True, put=False) - - def get_put_data(self, month=None, year=None, expiry=None): - """ - ***Experimental*** - Gets put data for the stock with the expiration data in the - given month and year - - Parameters - ---------- - month : number, int, optional(default=None) - The month the options expire. This should be either 1 or 2 - digits. - - year : number, int, optional(default=None) - The year the options expire. This should be a 4 digit int. - - expiry : date-like or convertible or list-like object, optional (default=None) - The date (or dates) when options expire (defaults to current month) - - Returns - ------- - put_data: pandas.DataFrame - A DataFrame with requested options data. - - Index: - Strike: Option strike, int - Expiry: Option expiry, Timestamp - Type: Call or Put, string - Symbol: Option symbol as reported on Yahoo, string - Columns: - Last: Last option price, float - Chg: Change from prior day, float - Bid: Bid price, float - Ask: Ask price, float - Vol: Volume traded, int64 - Open_Int: Open interest, int64 - IsNonstandard: True if the the deliverable is not 100 shares, otherwise false - Underlying: Ticker of the underlying security, string - Underlying_Price: Price of the underlying security, float64 - Quote_Time: Time of the quote, Timestamp - - Notes - ----- - Note: Format of returned data frame is dependent on Yahoo and may change. - - When called, this function will add instance variables named - puts. See the following example: - - >>> aapl = Options('aapl') # Create object - >>> aapl.puts # will give an AttributeError - >>> aapl.get_put_data() # Get data and set ivars - >>> aapl.puts # Doesn't throw AttributeError - - return self.__setattr__(self, str(str(x) + str(y))) - - Also note that aapl.puts will always be the puts for the next - expiry. 
If the user calls this method with a different month - or year, the ivar will be named putsYYMMDD where YY, MM and DD are, - respectively, two digit representations of the year, month and day - for the expiry of the options. - """ - expiry = self._try_parse_dates(year, month, expiry) - return self._get_data_in_date_range(expiry, put=True, call=False) - - def get_near_stock_price(self, above_below=2, call=True, put=False, - month=None, year=None, expiry=None): - """ - ***Experimental*** - Returns a data frame of options that are near the current stock price. - - Parameters - ---------- - above_below : number, int, optional (default=2) - The number of strike prices above and below the stock price that - should be taken - - call : bool, default: True - Tells the function whether or not it should be using calls - - put : bool, default: False - Tells the function weather or not it should be using puts - - month : number, int, optional(default=None) - The month the options expire. This should be either 1 or 2 - digits. - - year : number, int, optional(default=None) - The year the options expire. This should be a 4 digit int. - - expiry : date-like or convertible or list-like object, optional (default=None) - The date (or dates) when options expire (defaults to current month) - - Returns - ------- - chopped: DataFrame - The resultant DataFrame chopped down to be 2 * above_below + 1 rows - desired. If there isn't data as far out as the user has asked for - then - - Note: Format of returned data frame is dependent on Yahoo and may change. - - """ - expiry = self._try_parse_dates(year, month, expiry) - data = self._get_data_in_date_range(expiry, call=call, put=put) - return self.chop_data(data, above_below, self.underlying_price) - - def chop_data(self, df, above_below=2, underlying_price=None): - """Returns a data frame only options that are near the current stock price.""" - - if not underlying_price: - try: - underlying_price = self.underlying_price - except AttributeError: - underlying_price = np.nan - - max_strike = max(df.index.get_level_values('Strike')) - min_strike = min(df.index.get_level_values('Strike')) - - if not np.isnan(underlying_price) and min_strike < underlying_price < max_strike: - start_index = np.where(df.index.get_level_values('Strike') - > underlying_price)[0][0] - - get_range = slice(start_index - above_below, - start_index + above_below + 1) - df = df[get_range].dropna(how='all') - - return df - - def _try_parse_dates(self, year, month, expiry): - """ - Validates dates provided by user. Ensures the user either provided both a month and a year or an expiry. - - Parameters - ---------- - year : int - Calendar year - - month : int - Calendar month - - expiry : date-like or convertible, (preferred) - Expiry date - - Returns - ------- - list of expiry dates (datetime.date) - """ - - #Checks if the user gave one of the month or the year but not both and did not provide an expiry: - if (month is not None and year is None) or (month is None and year is not None) and expiry is None: - msg = "You must specify either (`year` and `month`) or `expiry` " \ - "or none of these options for the next expiry." 
- raise ValueError(msg) - - if expiry is not None: - if hasattr(expiry, '__iter__'): - expiry = [self._validate_expiry(exp) for exp in expiry] - else: - expiry = [self._validate_expiry(expiry)] - - if len(expiry) == 0: - raise ValueError('No expiries available for given input.') - - elif year is None and month is None: - #No arguments passed, provide next expiry - year = CUR_YEAR - month = CUR_MONTH - expiry = dt.date(year, month, 1) - expiry = [self._validate_expiry(expiry)] - - else: - #Year and month passed, provide all expiries in that month - expiry = [expiry for expiry in self.expiry_dates if expiry.year == year and expiry.month == month] - if len(expiry) == 0: - raise ValueError('No expiries available in %s-%s' % (year, month)) - - return expiry - - def _validate_expiry(self, expiry): - """Ensures that an expiry date has data available on Yahoo - If the expiry date does not have options that expire on that day, return next expiry""" - - expiry_dates = self.expiry_dates - expiry = to_datetime(expiry) - if hasattr(expiry, 'date'): - expiry = expiry.date() - - if expiry in expiry_dates: - return expiry - else: - index = DatetimeIndex(expiry_dates).sort_values() - return index[index.date >= expiry][0].date() - - def get_forward_data(self, months, call=True, put=False, near=False, - above_below=2): - """ - ***Experimental*** - Gets either call, put, or both data for months starting in the current - month and going out in the future a specified amount of time. - - Parameters - ---------- - months : number, int - How many months to go out in the collection of the data. This is - inclusive. - - call : bool, optional (default=True) - Whether or not to collect data for call options - - put : bool, optional (default=False) - Whether or not to collect data for put options. - - near : bool, optional (default=False) - Whether this function should get only the data near the - current stock price. Uses Options.get_near_stock_price - - above_below : number, int, optional (default=2) - The number of strike prices above and below the stock price that - should be taken if the near option is set to True - - Returns - ------- - pandas.DataFrame - A DataFrame with requested options data. - - Index: - Strike: Option strike, int - Expiry: Option expiry, Timestamp - Type: Call or Put, string - Symbol: Option symbol as reported on Yahoo, string - Columns: - Last: Last option price, float - Chg: Change from prior day, float - Bid: Bid price, float - Ask: Ask price, float - Vol: Volume traded, int64 - Open_Int: Open interest, int64 - IsNonstandard: True if the the deliverable is not 100 shares, otherwise false - Underlying: Ticker of the underlying security, string - Underlying_Price: Price of the underlying security, float64 - Quote_Time: Time of the quote, Timestamp - - Note: Format of returned data frame is dependent on Yahoo and may change. - - """ - warnings.warn("get_forward_data() is deprecated", FutureWarning, - stacklevel=2) - end_date = dt.date.today() + MonthEnd(months) - dates = (date for date in self.expiry_dates if date <= end_date.date()) - data = self._get_data_in_date_range(dates, call=call, put=put) - if near: - data = self.chop_data(data, above_below=above_below) - return data - - def get_all_data(self, call=True, put=True): - """ - ***Experimental*** - Gets either call, put, or both data for all available months starting - in the current month. 
- - Parameters - ---------- - call : bool, optional (default=True) - Whether or not to collect data for call options - - put : bool, optional (default=True) - Whether or not to collect data for put options. - - Returns - ------- - pandas.DataFrame - A DataFrame with requested options data. - - Index: - Strike: Option strike, int - Expiry: Option expiry, Timestamp - Type: Call or Put, string - Symbol: Option symbol as reported on Yahoo, string - Columns: - Last: Last option price, float - Chg: Change from prior day, float - Bid: Bid price, float - Ask: Ask price, float - Vol: Volume traded, int64 - Open_Int: Open interest, int64 - IsNonstandard: True if the the deliverable is not 100 shares, otherwise false - Underlying: Ticker of the underlying security, string - Underlying_Price: Price of the underlying security, float64 - Quote_Time: Time of the quote, Timestamp - - Note: Format of returned data frame is dependent on Yahoo and may change. - - """ - - try: - expiry_dates = self.expiry_dates - except AttributeError: - expiry_dates, _ = self._get_expiry_dates_and_links() - - return self._get_data_in_date_range(dates=expiry_dates, call=call, put=put) - - def _get_data_in_date_range(self, dates, call=True, put=True): - - to_ret = Series({'calls': call, 'puts': put}) - to_ret = to_ret[to_ret].index - data = [] - - for name in to_ret: - for expiry_date in dates: - nam = name + self._expiry_to_string(expiry_date) - try: # Try to access on the instance - frame = getattr(self, nam) - except AttributeError: - frame = self._get_option_data(expiry=expiry_date, name=name) - data.append(frame) - - return concat(data).sortlevel() - - @property - def expiry_dates(self): - """ - Returns a list of available expiry dates - """ - try: - expiry_dates = self._expiry_dates - except AttributeError: - expiry_dates, _ = self._get_expiry_dates_and_links() - return expiry_dates - - def _get_expiry_dates_and_links(self): - """ - Gets available expiry dates. - - Returns - ------- - Tuple of: - List of datetime.date objects - Dict of datetime.date objects as keys and corresponding links - """ - - url = self._OPTIONS_BASE_URL.format(sym=self.symbol) - root = self._parse_url(url) - - try: - links = root.xpath('//*[@id="options_menu"]/form/select/option') - except IndexError: - raise RemoteDataError('Expiry dates not available') - - expiry_dates = [dt.datetime.strptime(element.text, "%B %d, %Y").date() for element in links] - links = [element.attrib['data-selectbox-link'] for element in links] - - if len(expiry_dates) == 0: - raise RemoteDataError('Data not available') - - expiry_links = dict(zip(expiry_dates, links)) - self._expiry_links = expiry_links - self._expiry_dates = expiry_dates - return expiry_dates, expiry_links - - def _parse_url(self, url): - """ - Downloads and parses a URL, returns xml root. - - """ - try: - from lxml.html import parse - except ImportError: - raise ImportError("Please install lxml if you want to use the " - "{0!r} class".format(self.__class__.__name__)) - try: - doc = parse(url) - except _network_error_classes: - raise RemoteDataError("Unable to parse URL " - "{0!r}".format(url)) - else: - root = doc.getroot() - if root is None: - raise RemoteDataError("Parsed URL {0!r} has no root" - "element".format(url)) - return root - - def _process_data(self, frame, type): - """ - Adds columns for Expiry, IsNonstandard (ie: deliverable is not 100 shares) - and Tag (the tag indicating what is actually deliverable, None if standard). 
- - """ - frame.columns = ['Strike', 'Symbol', 'Last', 'Bid', 'Ask', 'Chg', 'PctChg', 'Vol', 'Open_Int', 'IV'] - frame["Rootexp"] = frame.Symbol.str[0:-9] - frame["Root"] = frame.Rootexp.str[0:-6] - frame["Expiry"] = to_datetime(frame.Rootexp.str[-6:]) - #Removes dashes in equity ticker to map to option ticker. - #Ex: BRK-B to BRKB140517C00100000 - frame["IsNonstandard"] = frame['Root'] != self.symbol.replace('-', '') - del frame["Rootexp"] - frame["Underlying"] = self.symbol - try: - frame['Underlying_Price'] = self.underlying_price - frame["Quote_Time"] = self.quote_time - except AttributeError: - frame['Underlying_Price'] = np.nan - frame["Quote_Time"] = np.nan - frame.rename(columns={'Open Int': 'Open_Int'}, inplace=True) - frame['Type'] = type - frame.set_index(['Strike', 'Expiry', 'Type', 'Symbol'], inplace=True) - - return frame +raise ImportError( + "The pandas.io.data module is moved to a separate package " + "(pandas-datareader). After installing the pandas-datareader package " + "(https://github.com/pydata/pandas-datareader), you can change " + "the import ``from pandas.io import data, wb`` to " + "``from pandas_datareader import data, wb``.") diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 4c26480a0f583..5e4dd4379a8e3 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -10,10 +10,14 @@ import abc import numpy as np +from pandas.types.common import (is_integer, is_float, + is_bool, is_list_like) + from pandas.core.frame import DataFrame from pandas.io.parsers import TextParser from pandas.io.common import (_is_url, _urlopen, _validate_header_arg, - EmptyDataError, get_filepath_or_buffer) + EmptyDataError, get_filepath_or_buffer, + _NA_VALUES) from pandas.tseries.period import Period from pandas import json from pandas.compat import (map, zip, reduce, range, lrange, u, add_metaclass, @@ -22,15 +26,107 @@ from pandas.formats.printing import pprint_thing import pandas.compat as compat import pandas.compat.openpyxl_compat as openpyxl_compat -import pandas.core.common as com from warnings import warn from distutils.version import LooseVersion +from pandas.util.decorators import Appender __all__ = ["read_excel", "ExcelWriter", "ExcelFile"] _writer_extensions = ["xlsx", "xls", "xlsm"] _writers = {} +_read_excel_doc = """ +Read an Excel table into a pandas DataFrame + +Parameters +---------- +io : string, path object (pathlib.Path or py._path.local.LocalPath), + file-like object, pandas ExcelFile, or xlrd workbook. + The string could be a URL. Valid URL schemes include http, ftp, s3, + and file. For file URLs, a host is expected. For instance, a local + file could be file://localhost/path/to/workbook.xlsx +sheetname : string, int, mixed list of strings/ints, or None, default 0 + + Strings are used for sheet names, Integers are used in zero-indexed + sheet positions. + + Lists of strings/integers are used to request multiple sheets. + + Specify None to get all sheets. + + str|int -> DataFrame is returned. + list|None -> Dict of DataFrames is returned, with keys representing + sheets. + + Available Cases + + * Defaults to 0 -> 1st sheet as a DataFrame + * 1 -> 2nd sheet as a DataFrame + * "Sheet1" -> 1st sheet as a DataFrame + * [0,1,"Sheet5"] -> 1st, 2nd & 5th sheet as a dictionary of DataFrames + * None -> All sheets as a dictionary of DataFrames + +header : int, list of ints, default 0 + Row (0-indexed) to use for the column labels of the parsed + DataFrame. 
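# Illustrative sketch (not part of the patch): after this change pandas.io.data
# only raises the ImportError above, so callers switch to the separate
# pandas-datareader package, replacing
#     from pandas.io import data, wb
# with
from pandas_datareader import data, wb   # requires `pip install pandas-datareader`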
If a list of integers is passed those row positions will + be combined into a ``MultiIndex`` +skiprows : list-like + Rows to skip at the beginning (0-indexed) +skip_footer : int, default 0 + Rows at the end to skip (0-indexed) +index_col : int, list of ints, default None + Column (0-indexed) to use as the row labels of the DataFrame. + Pass None if there is no such column. If a list is passed, + those columns will be combined into a ``MultiIndex`` +names : array-like, default None + List of column names to use. If file contains no header row, + then you should explicitly pass header=None +converters : dict, default None + Dict of functions for converting values in certain columns. Keys can + either be integers or column labels, values are functions that take one + input argument, the Excel cell content, and return the transformed + content. +parse_cols : int or list, default None + * If None then parse all columns, + * If int then indicates last column to be parsed + * If list of ints then indicates list of column numbers to be parsed + * If string then indicates comma separated list of column names and + column ranges (e.g. "A:E" or "A,C,E:F") +squeeze : boolean, default False + If the parsed data only contains one column then return a Series +na_values : scalar, str, list-like, or dict, default None + Additional strings to recognize as NA/NaN. If dict passed, specific + per-column NA values. By default the following values are interpreted + as NaN: '""" + "', '".join(sorted(_NA_VALUES)) + """'. +thousands : str, default None + Thousands separator for parsing string columns to numeric. Note that + this parameter is only necessary for columns stored as TEXT in Excel, + any numeric columns will automatically be parsed, regardless of display + format. +keep_default_na : bool, default True + If na_values are specified and keep_default_na is False the default NaN + values are overridden, otherwise they're appended to. +verbose : boolean, default False + Indicate number of NA values placed in non-numeric columns +engine: string, default None + If io is not a buffer or path, this must be set to identify io. + Acceptable values are None or xlrd +convert_float : boolean, default True + convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric + data will be read in as floats: Excel stores all numbers as floats + internally +has_index_names : boolean, default None + DEPRECATED: for version 0.17+ index names will be automatically + inferred based on index_col. To read Excel output from 0.16.2 and + prior that had saved index names, use True. + +Returns +------- +parsed : DataFrame or Dict of DataFrames + DataFrame from the passed in Excel file. See notes in sheetname + argument for more information on when a Dict of Dataframes is returned. +""" + def register_writer(klass): """Adds engine to the excel writer registry. You must use this method to @@ -72,100 +168,13 @@ def get_writer(engine_name): raise ValueError("No Excel writer '%s'" % engine_name) +@Appender(_read_excel_doc) def read_excel(io, sheetname=0, header=0, skiprows=None, skip_footer=0, index_col=None, names=None, parse_cols=None, parse_dates=False, date_parser=None, na_values=None, thousands=None, convert_float=True, has_index_names=None, converters=None, engine=None, squeeze=False, **kwds): - """ - Read an Excel table into a pandas DataFrame - Parameters - ---------- - io : string, path object (pathlib.Path or py._path.local.LocalPath), - file-like object, pandas ExcelFile, or xlrd workbook. - The string could be a URL. 
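# Illustrative usage sketch (not part of the patch) for the read_excel options
# documented above; the workbook path and sheet names are placeholders.
import pandas as pd

# A single sheet name/index returns a DataFrame; a list returns a dict of them.
df = pd.read_excel('workbook.xlsx', sheetname='Sheet1',
                   na_values={'price': ['n/a']},    # per-column NA strings (new)
                   convert_float=True)
sheets = pd.read_excel('workbook.xlsx', sheetname=[0, 'Sheet2'])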
Valid URL schemes include http, ftp, s3, - and file. For file URLs, a host is expected. For instance, a local - file could be file://localhost/path/to/workbook.xlsx - sheetname : string, int, mixed list of strings/ints, or None, default 0 - - Strings are used for sheet names, Integers are used in zero-indexed - sheet positions. - - Lists of strings/integers are used to request multiple sheets. - - Specify None to get all sheets. - - str|int -> DataFrame is returned. - list|None -> Dict of DataFrames is returned, with keys representing - sheets. - - Available Cases - - * Defaults to 0 -> 1st sheet as a DataFrame - * 1 -> 2nd sheet as a DataFrame - * "Sheet1" -> 1st sheet as a DataFrame - * [0,1,"Sheet5"] -> 1st, 2nd & 5th sheet as a dictionary of DataFrames - * None -> All sheets as a dictionary of DataFrames - - header : int, list of ints, default 0 - Row (0-indexed) to use for the column labels of the parsed - DataFrame. If a list of integers is passed those row positions will - be combined into a ``MultiIndex`` - skiprows : list-like - Rows to skip at the beginning (0-indexed) - skip_footer : int, default 0 - Rows at the end to skip (0-indexed) - index_col : int, list of ints, default None - Column (0-indexed) to use as the row labels of the DataFrame. - Pass None if there is no such column. If a list is passed, - those columns will be combined into a ``MultiIndex`` - names : array-like, default None - List of column names to use. If file contains no header row, - then you should explicitly pass header=None - converters : dict, default None - Dict of functions for converting values in certain columns. Keys can - either be integers or column labels, values are functions that take one - input argument, the Excel cell content, and return the transformed - content. - parse_cols : int or list, default None - * If None then parse all columns, - * If int then indicates last column to be parsed - * If list of ints then indicates list of column numbers to be parsed - * If string then indicates comma separated list of column names and - column ranges (e.g. "A:E" or "A,C,E:F") - squeeze : boolean, default False - If the parsed data only contains one column then return a Series - na_values : list-like, default None - List of additional strings to recognize as NA/NaN - thousands : str, default None - Thousands separator for parsing string columns to numeric. Note that - this parameter is only necessary for columns stored as TEXT in Excel, - any numeric columns will automatically be parsed, regardless of display - format. - keep_default_na : bool, default True - If na_values are specified and keep_default_na is False the default NaN - values are overridden, otherwise they're appended to - verbose : boolean, default False - Indicate number of NA values placed in non-numeric columns - engine: string, default None - If io is not a buffer or path, this must be set to identify io. - Acceptable values are None or xlrd - convert_float : boolean, default True - convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric - data will be read in as floats: Excel stores all numbers as floats - internally - has_index_names : boolean, default None - DEPRECATED: for version 0.17+ index names will be automatically - inferred based on index_col. To read Excel output from 0.16.2 and - prior that had saved index names, use True. - - Returns - ------- - parsed : DataFrame or Dict of DataFrames - DataFrame from the passed in Excel file. 
See notes in sheetname - argument for more information on when a Dict of Dataframes is returned. - """ if not isinstance(io, ExcelFile): io = ExcelFile(io, engine=engine) @@ -423,27 +432,30 @@ def _parse_cell(cell_contents, cell_typ): output[asheetname] = DataFrame() continue - if com.is_list_like(header) and len(header) == 1: + if is_list_like(header) and len(header) == 1: header = header[0] # forward fill and pull out names for MultiIndex column header_names = None if header is not None: - if com.is_list_like(header): + if is_list_like(header): header_names = [] + control_row = [True for x in data[0]] for row in header: - if com.is_integer(skiprows): + if is_integer(skiprows): row += skiprows - data[row] = _fill_mi_header(data[row]) + + data[row], control_row = _fill_mi_header( + data[row], control_row) header_name, data[row] = _pop_header_name( data[row], index_col) header_names.append(header_name) else: data[header] = _trim_excel_header(data[header]) - if com.is_list_like(index_col): + if is_list_like(index_col): # forward fill values for MultiIndex index - if not com.is_list_like(header): + if not is_list_like(header): offset = 1 + header else: offset = 1 + max(header) @@ -456,7 +468,7 @@ def _parse_cell(cell_contents, cell_typ): else: last = data[row][col] - if com.is_list_like(header) and len(header) > 1: + if is_list_like(header) and len(header) > 1: has_index_names = True # GH 12292 : error when read one empty column from excel file @@ -468,7 +480,7 @@ def _parse_cell(cell_contents, cell_typ): parse_dates=parse_dates, date_parser=date_parser, skiprows=skiprows, - skip_footer=skip_footer, + skipfooter=skip_footer, squeeze=squeeze, **kwds) @@ -511,16 +523,35 @@ def _trim_excel_header(row): return row -def _fill_mi_header(row): - # forward fill blanks entries - # from headers if parsing as MultiIndex +def _fill_mi_header(row, control_row): + """Forward fills blank entries in row, but only inside the same parent index + + Used for creating headers in Multiindex. + Parameters + ---------- + row : list + List of items in a single row. + constrol_row : list of boolean + Helps to determine if particular column is in same parent index as the + previous value. Used to stop propagation of empty cells between + different indexes. + + Returns + ---------- + Returns changed row and control_row + """ last = row[0] for i in range(1, len(row)): + if not control_row[i]: + last = row[i] + if row[i] == '' or row[i] is None: row[i] = last else: + control_row[i] = False last = row[i] - return row + + return row, control_row # fill blank if index_col not None @@ -534,21 +565,21 @@ def _pop_header_name(row, index_col): return none_fill(row[0]), row[1:] else: # pop out header name and fill w/ blank - i = index_col if not com.is_list_like(index_col) else max(index_col) + i = index_col if not is_list_like(index_col) else max(index_col) return none_fill(row[i]), row[:i] + [''] + row[i + 1:] def _conv_value(val): # Convert numpy types to Python types for the Excel writers. - if com.is_integer(val): + if is_integer(val): val = int(val) - elif com.is_float(val): + elif is_float(val): val = float(val) - elif com.is_bool(val): + elif is_bool(val): val = bool(val) elif isinstance(val, Period): val = "%s" % val - elif com.is_list_like(val): + elif is_list_like(val): val = str(val) return val diff --git a/pandas/io/ga.py b/pandas/io/ga.py index 6dd0bb7472c37..45424e78ddbe7 100644 --- a/pandas/io/ga.py +++ b/pandas/io/ga.py @@ -1,5 +1,5 @@ """ -1. Goto https://code.google.com/apis/console +1. 
Goto https://console.developers.google.com/iam-admin/projects 2. Create new project 3. Goto APIs and register for OAuth2.0 for installed applications 4. Download JSON secret file and move into same directory as this file diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py index e706434f29dc5..8f23e82daf2e3 100644 --- a/pandas/io/gbq.py +++ b/pandas/io/gbq.py @@ -46,8 +46,12 @@ def _test_google_api_imports(): try: import httplib2 # noqa - from apiclient.discovery import build # noqa - from apiclient.errors import HttpError # noqa + try: + from googleapiclient.discovery import build # noqa + from googleapiclient.errors import HttpError # noqa + except: + from apiclient.discovery import build # noqa + from apiclient.errors import HttpError # noqa from oauth2client.client import AccessTokenRefreshError # noqa from oauth2client.client import OAuth2WebServerFlow # noqa from oauth2client.file import Storage # noqa @@ -141,13 +145,14 @@ class GbqConnector(object): scope = 'https://www.googleapis.com/auth/bigquery' def __init__(self, project_id, reauth=False, verbose=False, - private_key=None): + private_key=None, dialect='legacy'): _check_google_client_version() _test_google_api_imports() self.project_id = project_id self.reauth = reauth self.verbose = verbose self.private_key = private_key + self.dialect = dialect self.credentials = self.get_credentials() self.service = self.get_service() @@ -155,7 +160,60 @@ def get_credentials(self): if self.private_key: return self.get_service_account_credentials() else: - return self.get_user_account_credentials() + # Try to retrieve Application Default Credentials + credentials = self.get_application_default_credentials() + if not credentials: + credentials = self.get_user_account_credentials() + return credentials + + def get_application_default_credentials(self): + """ + This method tries to retrieve the "default application credentials". + This could be useful for running code on Google Cloud Platform. + + .. versionadded:: 0.19.0 + + Parameters + ---------- + None + + Returns + ------- + - GoogleCredentials, + If the default application credentials can be retrieved + from the environment. The retrieved credentials should also + have access to the project (self.project_id) on BigQuery. + - OR None, + If default application credentials can not be retrieved + from the environment. Or, the retrieved credentials do not + have access to the project (self.project_id) on BigQuery. 
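# Illustrative usage sketch (not part of the patch): with the change above,
# code running on Google Cloud Platform can omit `private_key`; the connector
# first tries the application default credentials before falling back to the
# interactive user flow. 'my-project' is a placeholder project id.
import pandas as pd

df = pd.read_gbq('SELECT 1 AS x', project_id='my-project')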
+ """ + import httplib2 + try: + from googleapiclient.discovery import build + except ImportError: + from apiclient.discovery import build + try: + from oauth2client.client import GoogleCredentials + except ImportError: + return None + + try: + credentials = GoogleCredentials.get_application_default() + except: + return None + + http = httplib2.Http() + try: + http = credentials.authorize(http) + bigquery_service = build('bigquery', 'v2', http=http) + # Check if the application has rights to the BigQuery project + jobs = bigquery_service.jobs() + job_data = {'configuration': {'query': {'query': 'SELECT 1'}}} + jobs.insert(projectId=self.project_id, body=job_data).execute() + return credentials + except: + return None def get_user_account_credentials(self): from oauth2client.client import OAuth2WebServerFlow @@ -266,7 +324,10 @@ def sizeof_fmt(num, suffix='b'): def get_service(self): import httplib2 - from apiclient.discovery import build + try: + from googleapiclient.discovery import build + except: + from apiclient.discovery import build http = httplib2.Http() http = self.credentials.authorize(http) @@ -315,7 +376,10 @@ def process_insert_errors(self, insert_errors): raise StreamingInsertError def run_query(self, query): - from apiclient.errors import HttpError + try: + from googleapiclient.errors import HttpError + except: + from apiclient.errors import HttpError from oauth2client.client import AccessTokenRefreshError _check_google_client_version() @@ -324,7 +388,8 @@ def run_query(self, query): job_data = { 'configuration': { 'query': { - 'query': query + 'query': query, + 'useLegacySql': self.dialect == 'legacy' # 'allowLargeResults', 'createDisposition', # 'preserveNulls', destinationTable, useQueryCache } @@ -420,15 +485,17 @@ def run_query(self, query): return schema, result_pages def load_data(self, dataframe, dataset_id, table_id, chunksize): - from apiclient.errors import HttpError + try: + from googleapiclient.errors import HttpError + except: + from apiclient.errors import HttpError job_id = uuid.uuid4().hex rows = [] remaining_rows = len(dataframe) - if self.verbose: - total_rows = remaining_rows - self._print("\n\n") + total_rows = remaining_rows + self._print("\n\n") for index, row in dataframe.reset_index(drop=True).iterrows(): row_dict = dict() @@ -474,7 +541,10 @@ def load_data(self, dataframe, dataset_id, table_id, chunksize): self._print("\n") def verify_schema(self, dataset_id, table_id, schema): - from apiclient.errors import HttpError + try: + from googleapiclient.errors import HttpError + except: + from apiclient.errors import HttpError try: return (self.service.tables().get( @@ -547,7 +617,7 @@ def _parse_entry(field_value, field_type): def read_gbq(query, project_id=None, index_col=None, col_order=None, - reauth=False, verbose=True, private_key=None): + reauth=False, verbose=True, private_key=None, dialect='legacy'): """Load data from Google BigQuery. THIS IS AN EXPERIMENTAL LIBRARY @@ -560,10 +630,20 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, https://developers.google.com/api-client-library/python/apis/bigquery/v2 Authentication to the Google BigQuery service is via OAuth 2.0. - By default user account credentials are used. You will be asked to - grant permissions for product name 'pandas GBQ'. It is also posible - to authenticate via service account credentials by using - private_key parameter. + + - If "private_key" is not provided: + + By default "application default credentials" are used. + + .. 
versionadded:: 0.19.0 + + If default application credentials are not found or are restrictive, + user account credentials are used. In this case, you will be asked to + grant permissions for product name 'pandas GBQ'. + + - If "private_key" is provided: + + Service account credentials will be used to authenticate. Parameters ---------- @@ -586,6 +666,17 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, or string contents. This is useful for remote server authentication (eg. jupyter iPython notebook on remote host) + .. versionadded:: 0.18.1 + + dialect : {'legacy', 'standard'}, default 'legacy' + 'legacy' : Use BigQuery's legacy SQL dialect. + 'standard' : Use BigQuery's standard SQL (beta), which is + compliant with the SQL 2011 standard. For more information + see `BigQuery SQL Reference + `__ + + .. versionadded:: 0.19.0 + Returns ------- df: DataFrame @@ -596,8 +687,12 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, if not project_id: raise TypeError("Missing required parameter: project_id") + if dialect not in ('legacy', 'standard'): + raise ValueError("'{0}' is not valid for dialect".format(dialect)) + connector = GbqConnector(project_id, reauth=reauth, verbose=verbose, - private_key=private_key) + private_key=private_key, + dialect=dialect) schema, pages = connector.run_query(query) dataframe_list = [] while len(pages) > 0: @@ -656,10 +751,20 @@ def to_gbq(dataframe, destination_table, project_id, chunksize=10000, https://developers.google.com/api-client-library/python/apis/bigquery/v2 Authentication to the Google BigQuery service is via OAuth 2.0. - By default user account credentials are used. You will be asked to - grant permissions for product name 'pandas GBQ'. It is also posible - to authenticate via service account credentials by using - private_key parameter. + + - If "private_key" is not provided: + + By default "application default credentials" are used. + + .. versionadded:: 0.19.0 + + If default application credentials are not found or are restrictive, + user account credentials are used. In this case, you will be asked to + grant permissions for product name 'pandas GBQ'. + + - If "private_key" is provided: + + Service account credentials will be used to authenticate. 
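# Illustrative usage sketch (not part of the patch): the new `dialect` keyword
# selects BigQuery's standard SQL instead of the default legacy dialect.
# 'my-project' is a placeholder project id.
import pandas as pd

df = pd.read_gbq('SELECT 1 AS x', project_id='my-project', dialect='standard')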
Parameters ---------- @@ -765,7 +870,10 @@ class _Table(GbqConnector): def __init__(self, project_id, dataset_id, reauth=False, verbose=False, private_key=None): - from apiclient.errors import HttpError + try: + from googleapiclient.errors import HttpError + except: + from apiclient.errors import HttpError self.http_error = HttpError self.dataset_id = dataset_id super(_Table, self).__init__(project_id, reauth, verbose, private_key) @@ -865,7 +973,10 @@ class _Dataset(GbqConnector): def __init__(self, project_id, reauth=False, verbose=False, private_key=None): - from apiclient.errors import HttpError + try: + from googleapiclient.errors import HttpError + except: + from apiclient.errors import HttpError self.http_error = HttpError super(_Dataset, self).__init__(project_id, reauth, verbose, private_key) diff --git a/pandas/io/html.py b/pandas/io/html.py index 48caaa39dd711..3c38dae91eb89 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -12,12 +12,12 @@ import numpy as np +from pandas.types.common import is_list_like from pandas.io.common import (EmptyDataError, _is_url, urlopen, parse_url, _validate_header_arg) from pandas.io.parsers import TextParser from pandas.compat import (lrange, lmap, u, string_types, iteritems, raise_with_traceback, binary_type) -from pandas.core import common as com from pandas import Series from pandas.core.common import AbstractMethodError from pandas.formats.printing import pprint_thing @@ -107,7 +107,7 @@ def _get_skiprows(skiprows): """ if isinstance(skiprows, slice): return lrange(skiprows.start or 0, skiprows.stop, skiprows.step or 1) - elif isinstance(skiprows, numbers.Integral) or com.is_list_like(skiprows): + elif isinstance(skiprows, numbers.Integral) or is_list_like(skiprows): return skiprows elif skiprows is None: return 0 @@ -611,10 +611,10 @@ def _expand_elements(body): body[ind] += empty * (lens_max - length) -def _data_to_frame(data, header, index_col, skiprows, - parse_dates, tupleize_cols, thousands, - decimal): - head, body, foot = data +def _data_to_frame(**kwargs): + head, body, foot = kwargs.pop('data') + header = kwargs.pop('header') + kwargs['skiprows'] = _get_skiprows(kwargs['skiprows']) if head: body = [head] + body @@ -628,10 +628,7 @@ def _data_to_frame(data, header, index_col, skiprows, # fill out elements of body that are "ragged" _expand_elements(body) - tp = TextParser(body, header=header, index_col=index_col, - skiprows=_get_skiprows(skiprows), - parse_dates=parse_dates, tupleize_cols=tupleize_cols, - thousands=thousands, decimal=decimal) + tp = TextParser(body, header=header, **kwargs) df = tp.read() return df @@ -716,9 +713,7 @@ def _validate_flavor(flavor): return flavor -def _parse(flavor, io, match, header, index_col, skiprows, - parse_dates, tupleize_cols, thousands, attrs, encoding, - decimal): +def _parse(flavor, io, match, attrs, encoding, **kwargs): flavor = _validate_flavor(flavor) compiled_match = re.compile(match) # you can pass a compiled regex here @@ -740,15 +735,7 @@ def _parse(flavor, io, match, header, index_col, skiprows, ret = [] for table in tables: try: - ret.append(_data_to_frame(data=table, - header=header, - index_col=index_col, - skiprows=skiprows, - parse_dates=parse_dates, - tupleize_cols=tupleize_cols, - thousands=thousands, - decimal=decimal - )) + ret.append(_data_to_frame(data=table, **kwargs)) except EmptyDataError: # empty table continue return ret @@ -757,7 +744,8 @@ def _parse(flavor, io, match, header, index_col, skiprows, def read_html(io, match='.+', flavor=None, header=None, 
index_col=None, skiprows=None, attrs=None, parse_dates=False, tupleize_cols=False, thousands=',', encoding=None, - decimal='.'): + decimal='.', converters=None, na_values=None, + keep_default_na=True): r"""Read HTML tables into a ``list`` of ``DataFrame`` objects. Parameters @@ -837,7 +825,26 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, Character to recognize as decimal point (e.g. use ',' for European data). - .. versionadded:: 0.18.2 + .. versionadded:: 0.19.0 + + converters : dict, default None + Dict of functions for converting values in certain columns. Keys can + either be integers or column labels, values are functions that take one + input argument, the cell (not column) content, and return the + transformed content. + + .. versionadded:: 0.19.0 + + na_values : iterable, default None + Custom NA values + + .. versionadded:: 0.19.0 + + keep_default_na : bool, default True + If na_values are specified and keep_default_na is False the default NaN + values are overridden, otherwise they're appended to + + .. versionadded:: 0.19.0 Returns ------- @@ -881,6 +888,9 @@ def read_html(io, match='.+', flavor=None, header=None, index_col=None, raise ValueError('cannot skip rows starting from the end of the ' 'data (you passed a negative value)') _validate_header_arg(header) - return _parse(flavor, io, match, header, index_col, skiprows, - parse_dates, tupleize_cols, thousands, attrs, encoding, - decimal) + return _parse(flavor=flavor, io=io, match=match, header=header, + index_col=index_col, skiprows=skiprows, + parse_dates=parse_dates, tupleize_cols=tupleize_cols, + thousands=thousands, attrs=attrs, encoding=encoding, + decimal=decimal, converters=converters, na_values=na_values, + keep_default_na=keep_default_na) diff --git a/pandas/io/json.py b/pandas/io/json.py index fd97e51208f7e..e697351484f68 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -7,22 +7,25 @@ import pandas.json as _json from pandas.tslib import iNaT -from pandas.compat import long, u +from pandas.compat import StringIO, long, u from pandas import compat, isnull from pandas import Series, DataFrame, to_datetime -from pandas.io.common import get_filepath_or_buffer +from pandas.io.common import get_filepath_or_buffer, _get_handle from pandas.core.common import AbstractMethodError from pandas.formats.printing import pprint_thing loads = _json.loads dumps = _json.dumps -# interface to/from - +# interface to/from def to_json(path_or_buf, obj, orient=None, date_format='epoch', double_precision=10, force_ascii=True, date_unit='ms', - default_handler=None): + default_handler=None, lines=False): + + if lines and orient != 'records': + raise ValueError( + "'lines' keyword only valid when 'orient' is records") if isinstance(obj, Series): s = SeriesWriter( @@ -37,6 +40,9 @@ def to_json(path_or_buf, obj, orient=None, date_format='epoch', else: raise NotImplementedError("'obj' should be a Series or a DataFrame") + if lines: + s = _convert_to_line_delimits(s) + if isinstance(path_or_buf, compat.string_types): with open(path_or_buf, 'w') as fh: fh.write(s) @@ -105,7 +111,8 @@ def _format_axes(self): def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, convert_axes=True, convert_dates=True, keep_default_dates=True, - numpy=False, precise_float=False, date_unit=None): + numpy=False, precise_float=False, date_unit=None, encoding=None, + lines=False): """ Convert a JSON string to pandas object @@ -178,13 +185,23 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, is to 
try and detect the correct precision, but if this is not desired then pass one of 's', 'ms', 'us' or 'ns' to force parsing only seconds, milliseconds, microseconds or nanoseconds respectively. + lines : boolean, default False + Read the file as a json object per line. + + .. versionadded:: 0.19.0 + + encoding : str, default is 'utf-8' + The encoding to use to decode py3 bytes. + + .. versionadded:: 0.19.0 Returns ------- result : Series or DataFrame """ - filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf) + filepath_or_buffer, _, _ = get_filepath_or_buffer(path_or_buf, + encoding=encoding) if isinstance(filepath_or_buffer, compat.string_types): try: exists = os.path.exists(filepath_or_buffer) @@ -195,7 +212,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, exists = False if exists: - with open(filepath_or_buffer, 'r') as fh: + with _get_handle(filepath_or_buffer, 'r', encoding=encoding) as fh: json = fh.read() else: json = filepath_or_buffer @@ -204,6 +221,12 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, else: json = filepath_or_buffer + if lines: + # If given a json lines file, we break the string into lines, add + # commas and put it in a json list to make a valid json object. + lines = list(StringIO(json.strip())) + json = u'[' + u','.join(lines) + u']' + obj = None if typ == 'frame': obj = FrameParser(json, orient, dtype, convert_axes, convert_dates, @@ -574,6 +597,30 @@ def is_ok(col): # JSON normalization routines +def _convert_to_line_delimits(s): + """Helper function that converts json lists to line delimited json.""" + + # Determine we have a JSON list to turn to lines otherwise just return the + # json object, only lists can + if not s[0] == '[' and s[-1] == ']': + return s + s = s[1:-1] + num_open_brackets_seen = 0 + commas_to_replace = [] + for idx, char in enumerate(s): # iter through to find all + if char == ',': # commas that should be \n + if num_open_brackets_seen == 0: + commas_to_replace.append(idx) + elif char == '{': + num_open_brackets_seen += 1 + elif char == '}': + num_open_brackets_seen -= 1 + s_arr = np.array(list(s)) # Turn to an array to set + s_arr[commas_to_replace] = '\n' # all commas at once. 
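# Illustrative usage sketch (not part of the patch) for the new `lines`
# keyword: write and read line-delimited JSON (one record per line),
# assuming DataFrame.to_json forwards the keyword added above.
import pandas as pd

df = pd.DataFrame({'a': [1, 2], 'b': ['x', 'y']})
jsonl = df.to_json(orient='records', lines=True)   # roughly '{"a":1,"b":"x"}\n{"a":2,"b":"y"}'
round_tripped = pd.read_json(jsonl, lines=True)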
+ s = ''.join(s_arr) + return s + + def nested_to_record(ds, prefix="", level=0): """a simplified json_normalize diff --git a/pandas/io/packers.py b/pandas/io/packers.py index 23aa133125213..1838d9175e597 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -47,19 +47,21 @@ import numpy as np from pandas import compat from pandas.compat import u, u_safe + +from pandas.types.common import (is_categorical_dtype, is_object_dtype, + needs_i8_conversion, pandas_dtype) + from pandas import (Timestamp, Period, Series, DataFrame, # noqa Index, MultiIndex, Float64Index, Int64Index, Panel, RangeIndex, PeriodIndex, DatetimeIndex, NaT, Categorical) from pandas.tslib import NaTType -from pandas.sparse.api import SparseSeries, SparseDataFrame, SparsePanel +from pandas.sparse.api import SparseSeries, SparseDataFrame from pandas.sparse.array import BlockIndex, IntIndex from pandas.core.generic import NDFrame -from pandas.core.common import (PerformanceWarning, - is_categorical_dtype, is_object_dtype, - needs_i8_conversion, pandas_dtype) +from pandas.core.common import PerformanceWarning from pandas.io.common import get_filepath_or_buffer -from pandas.core.internals import BlockManager, make_block +from pandas.core.internals import BlockManager, make_block, _safe_reshape import pandas.core.internals as internals from pandas.msgpack import Unpacker as _Unpacker, Packer as _Packer, ExtType @@ -445,18 +447,6 @@ def encode(obj): # d['data'] = dict([(name, ss) # for name, ss in compat.iteritems(obj)]) # return d - elif isinstance(obj, SparsePanel): - raise NotImplementedError( - 'msgpack sparse frame is not implemented' - ) - # d = {'typ': 'sparse_panel', - # 'klass': obj.__class__.__name__, - # 'items': obj.items} - # for f in ['default_fill_value', 'default_kind']: - # d[f] = getattr(obj, f, None) - # d['data'] = dict([(name, df) - # for name, df in compat.iteritems(obj)]) - # return d else: data = obj._data @@ -481,12 +471,12 @@ def encode(obj): tz = obj.tzinfo if tz is not None: tz = u(tz.zone) - offset = obj.offset - if offset is not None: - offset = u(offset.freqstr) + freq = obj.freq + if freq is not None: + freq = u(freq.freqstr) return {u'typ': u'timestamp', u'value': obj.value, - u'offset': offset, + u'freq': freq, u'tz': tz} if isinstance(obj, NaTType): return {u'typ': u'nat'} @@ -556,7 +546,8 @@ def decode(obj): if typ is None: return obj elif typ == u'timestamp': - return Timestamp(obj[u'value'], tz=obj[u'tz'], offset=obj[u'offset']) + freq = obj[u'freq'] if 'freq' in obj else obj[u'offset'] + return Timestamp(obj[u'value'], tz=obj[u'tz'], freq=freq) elif typ == u'nat': return NaT elif typ == u'period': @@ -619,8 +610,9 @@ def decode(obj): axes = obj[u'axes'] def create_block(b): - values = unconvert(b[u'values'], dtype_for(b[u'dtype']), - b[u'compress']).reshape(b[u'shape']) + values = _safe_reshape(unconvert( + b[u'values'], dtype_for(b[u'dtype']), + b[u'compress']), b[u'shape']) # locs handles duplicate column names, and should be used instead # of items; see GH 9618 diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index bba8ad3ccd72b..93c431531355a 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2,26 +2,31 @@ Module contains tools for processing files into DataFrames or other objects """ from __future__ import print_function -from pandas.compat import range, lrange, StringIO, lzip, zip, string_types, map -from pandas import compat from collections import defaultdict import re import csv +import sys import warnings +import datetime import numpy as np -from 
pandas.core.index import Index, MultiIndex +from pandas import compat +from pandas.compat import (range, lrange, StringIO, lzip, + zip, string_types, map, u) +from pandas.types.common import (is_integer, _ensure_object, + is_list_like, is_integer_dtype, + is_float, + is_scalar) +from pandas.core.index import Index, MultiIndex, RangeIndex from pandas.core.frame import DataFrame -import datetime -import pandas.core.common as com from pandas.core.common import AbstractMethodError from pandas.core.config import get_option from pandas.io.date_converters import generic_parser from pandas.io.common import (get_filepath_or_buffer, _validate_header_arg, _get_handle, UnicodeReader, UTF8Recoder, BaseIterator, CParserError, EmptyDataError, - ParserWarning) + ParserWarning, _NA_VALUES) from pandas.tseries import tools from pandas.util.decorators import Appender @@ -29,13 +34,12 @@ import pandas.lib as lib import pandas.parser as _parser -# common NA values -# no longer excluding inf representations -# '1.#INF','-1.#INF', '1.#INF000000', -_NA_VALUES = set([ - '-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', - 'N/A', 'NA', '#NA', 'NULL', 'NaN', '-NaN', 'nan', '-nan', '' -]) + +# BOM character (byte order mark) +# This exists at the beginning of a file to indicate endianness +# of a file (stream). Unfortunately, this marker screws up parsing, +# so we need to remove it if we see it. +_BOM = u('\ufeff') _parser_params = """Also supports optionally iterating or breaking of the file into chunks. @@ -87,6 +91,14 @@ inferred from the document header row(s). For example, a valid `usecols` parameter would be [0, 1, 2] or ['foo', 'bar', 'baz']. Using this parameter results in much faster parsing time and lower memory usage. +as_recarray : boolean, default False + DEPRECATED: this argument will be removed in a future version. Please call + `pd.read_csv(...).to_records()` instead. + + Return a NumPy recarray instead of a DataFrame after parsing the data. + If set to True, this option takes precedence over the `squeeze` parameter. + In addition, as row indices are not available in such a format, the + `index_col` parameter will be ignored. squeeze : boolean, default False If the parsed data only contains one column then return a Series prefix : str, default None @@ -114,9 +126,11 @@ at the start of the file skipfooter : int, default 0 Number of lines at bottom of file to skip (Unsupported with engine='c') +skip_footer : int, default 0 + DEPRECATED: use the `skipfooter` parameter instead, as they are identical nrows : int, default None Number of rows of file to read. Useful for reading pieces of large files -na_values : str or list-like or dict, default None +na_values : scalar, str, list-like, or dict, default None Additional strings to recognize as NA/NaN. If dict passed, specific per-column NA values. By default the following values are interpreted as NaN: `'""" + "'`, `'".join(sorted(_NA_VALUES)) + """'`. @@ -183,15 +197,23 @@ Thousands separator decimal : str, default '.' Character to recognize as decimal point (e.g. use ',' for European data). +float_precision : string, default None + Specifies which converter the C engine should use for floating-point + values. The options are `None` for the ordinary converter, + `high` for the high-precision converter, and `round_trip` for the + round-trip converter. lineterminator : str (length 1), default None Character to break file into lines. Only valid with C parser. 
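# Illustrative usage sketch (not part of the patch): `na_values` now also
# accepts a scalar or a dict of per-column NA strings, as documented above.
import pandas as pd
from pandas.compat import StringIO

csv = 'a,b\n-,1\nx,missing\n'
df = pd.read_csv(StringIO(csv), na_values={'a': ['-', 'x'], 'b': ['missing']})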
quotechar : str (length 1), optional The character used to denote the start and end of a quoted item. Quoted items can include the delimiter and it will be ignored. -quoting : int or csv.QUOTE_* instance, default None +quoting : int or csv.QUOTE_* instance, default 0 Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3). - Default (None) results in QUOTE_MINIMAL behavior. +doublequote : boolean, default ``True`` + When quotechar is specified and quoting is not ``QUOTE_NONE``, indicate + whether or not to interpret two consecutive quotechar elements INSIDE a + field as a single ``quotechar`` element. escapechar : str (length 1), default None One-character string used to escape delimiter when quoting is QUOTE_NONE. comment : str, default None @@ -227,6 +249,26 @@ Note that the entire file is read into a single DataFrame regardless, use the `chunksize` or `iterator` parameter to return the data in chunks. (Only valid with C parser) +buffer_lines : int, default None + DEPRECATED: this argument will be removed in a future version because its + value is not respected by the parser +compact_ints : boolean, default False + DEPRECATED: this argument will be removed in a future version + + If compact_ints is True, then for any column that is of integer dtype, + the parser will attempt to cast it as the smallest integer dtype possible, + either signed or unsigned depending on the specification from the + `use_unsigned` parameter. +use_unsigned : boolean, default False + DEPRECATED: this argument will be removed in a future version + + If integer columns are being compacted (i.e. `compact_ints=True`), specify + whether the column should be compacted to the smallest signed or unsigned + integer dtype. +memory_map : boolean, default False + If a filepath is provided for `filepath_or_buffer`, map the file object + directly onto memory and access the data directly from there. Using this + option can improve performance because there is no longer any I/O overhead. Returns ------- @@ -238,11 +280,11 @@ Parser engine to use. The C engine is faster while the python engine is currently more feature-complete.""" -_sep_doc = """sep : str, default {default} +_sep_doc = r"""sep : str, default {default} Delimiter to use. If sep is None, will try to automatically determine - this. Separators longer than 1 character and different from '\s+' will be - interpreted as regular expressions, will force use of the python parsing - engine and will ignore quotes in the data. Regex example: '\\r\\t'""" + this. Separators longer than 1 character and different from ``'\s+'`` will + be interpreted as regular expressions, will force use of the python parsing + engine and will ignore quotes in the data. Regex example: ``'\r\t'``""" _read_csv_doc = """ Read CSV (comma-separated) file into DataFrame @@ -289,11 +331,11 @@ def _validate_nrows(nrows): msg = "'nrows' must be an integer" if nrows is not None: - if com.is_float(nrows): + if is_float(nrows): if int(nrows) != nrows: raise ValueError(msg) nrows = int(nrows) - elif not com.is_integer(nrows): + elif not is_integer(nrows): raise ValueError(msg) return nrows @@ -302,9 +344,9 @@ def _validate_nrows(nrows): def _read(filepath_or_buffer, kwds): "Generic reader of line files." 
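# Illustrative usage sketch (not part of the patch): the new `memory_map`
# option maps an on-disk file into memory before parsing instead of going
# through buffered I/O; 'large.csv' is a placeholder path.
import pandas as pd

df = pd.read_csv('large.csv', memory_map=True)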
encoding = kwds.get('encoding', None) - skipfooter = kwds.pop('skipfooter', None) - if skipfooter is not None: - kwds['skip_footer'] = skipfooter + if encoding is not None: + encoding = re.sub('_', '-', encoding).lower() + kwds['encoding'] = encoding # If the input could be a filename, check for a recognizable compression # extension. If we're reading from a URL, the `get_filepath_or_buffer` @@ -348,11 +390,15 @@ def _read(filepath_or_buffer, kwds): raise NotImplementedError("'nrows' and 'chunksize' cannot be used" " together yet.") elif nrows is not None: - return parser.read(nrows) + data = parser.read(nrows) + parser.close() + return data elif chunksize or iterator: return parser - return parser.read() + data = parser.read() + parser.close() + return data _parser_defaults = { 'delimiter': None, @@ -372,8 +418,8 @@ def _read(filepath_or_buffer, kwds): 'na_values': None, 'true_values': None, 'false_values': None, - 'skip_footer': 0, 'converters': None, + 'skipfooter': 0, 'keep_default_na': True, 'thousands': None, @@ -422,19 +468,21 @@ def _read(filepath_or_buffer, kwds): 'widths': None, } -_c_unsupported = set(['skip_footer']) +_c_unsupported = set(['skipfooter']) _python_unsupported = set([ - 'as_recarray', - 'compact_ints', - 'use_unsigned', 'low_memory', - 'memory_map', 'buffer_lines', 'error_bad_lines', 'warn_bad_lines', 'dtype', 'float_precision', ]) +_deprecated_args = set([ + 'as_recarray', + 'buffer_lines', + 'compact_ints', + 'use_unsigned', +]) def _make_parser_function(name, sep=','): @@ -462,7 +510,6 @@ def parser_f(filepath_or_buffer, false_values=None, skipinitialspace=False, skiprows=None, - skipfooter=None, nrows=None, # NA and Missing Data Handling @@ -500,8 +547,8 @@ def parser_f(filepath_or_buffer, error_bad_lines=True, warn_bad_lines=True, - # Deprecated - skip_footer=0, + skipfooter=0, + skip_footer=0, # deprecated # Internal doublequote=True, @@ -529,6 +576,13 @@ def parser_f(filepath_or_buffer, engine = 'c' engine_specified = False + if skip_footer != 0: + warnings.warn("The 'skip_footer' argument has " + "been deprecated and will be removed " + "in a future version. 
Please use the " + "'skipfooter' argument instead.", + FutureWarning, stacklevel=2) + kwds = dict(delimiter=delimiter, engine=engine, dialect=dialect, @@ -659,6 +713,7 @@ def __init__(self, f, engine=None, **kwds): # miscellanea self.engine = engine self._engine = None + self._currow = 0 options = self._get_options_with_defaults(engine) @@ -673,10 +728,7 @@ def __init__(self, f, engine=None, **kwds): self._make_engine(self.engine) def close(self): - try: - self._engine._reader.close() - except: - pass + self._engine.close() def _get_options_with_defaults(self, engine): kwds = self.orig_options @@ -726,11 +778,12 @@ def _clean_options(self, options, engine): # C engine not supported yet if engine == 'c': - if options['skip_footer'] > 0: + if options['skipfooter'] > 0: fallback_reason = "the 'c' engine does not support"\ - " skip_footer" + " skipfooter" engine = 'python' + encoding = sys.getfilesystemencoding() or 'utf-8' if sep is None and not delim_whitespace: if engine == 'c': fallback_reason = "the 'c' engine does not support"\ @@ -750,6 +803,19 @@ def _clean_options(self, options, engine): elif delim_whitespace: if 'python' in engine: result['delimiter'] = '\s+' + elif sep is not None: + encodeable = True + try: + if len(sep.encode(encoding)) > 1: + encodeable = False + except UnicodeDecodeError: + encodeable = False + if not encodeable and engine not in ('python', 'python-fwf'): + fallback_reason = "the separator encoded in {encoding}" \ + " is > 1 char long, and the 'c' engine" \ + " does not support such separators".format( + encoding=encoding) + engine = 'python' if fallback_reason and engine_specified: raise ValueError(fallback_reason) @@ -789,6 +855,23 @@ def _clean_options(self, options, engine): _validate_header_arg(options['header']) + depr_warning = '' + + for arg in _deprecated_args: + parser_default = _c_parser_defaults[arg] + msg = ("The '{arg}' argument has been deprecated " + "and will be removed in a future version." + .format(arg=arg)) + + if arg == 'as_recarray': + msg += ' Please call pd.to_csv(...).to_records() instead.' 
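# Illustrative usage sketch (not part of the patch): `skip_footer` now emits
# the FutureWarning built above; `skipfooter` is the supported spelling (only
# the python engine implements it).
import pandas as pd
from pandas.compat import StringIO

csv = 'a,b\n1,2\n3,4\ntotal,6\n'
df = pd.read_csv(StringIO(csv), skipfooter=1, engine='python')   # drops the 'total' row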
+ + if result.get(arg, parser_default) != parser_default: + depr_warning += msg + '\n\n' + + if depr_warning != '': + warnings.warn(depr_warning, FutureWarning, stacklevel=2) + if index_col is True: raise ValueError("The value of index_col couldn't be 'True'") if _is_index_col(index_col): @@ -813,7 +896,7 @@ def _clean_options(self, options, engine): # handle skiprows; this is internally handled by the # c-engine, so only need for python parsers if engine != 'c': - if com.is_integer(skiprows): + if is_integer(skiprows): skiprows = lrange(skiprows) skiprows = set() if skiprows is None else set(skiprows) @@ -827,7 +910,11 @@ def _clean_options(self, options, engine): return result, engine def __next__(self): - return self.get_chunk() + try: + return self.get_chunk() + except StopIteration: + self.close() + raise def _make_engine(self, engine='c'): if engine == 'c': @@ -844,8 +931,8 @@ def _failover_to_python(self): def read(self, nrows=None): if nrows is not None: - if self.options.get('skip_footer'): - raise ValueError('skip_footer not supported for iteration') + if self.options.get('skipfooter'): + raise ValueError('skipfooter not supported for iteration') ret = self._engine.read(nrows) @@ -855,8 +942,20 @@ def read(self, nrows=None): # May alter columns / col_dict index, columns, col_dict = self._create_index(ret) + if index is None: + if col_dict: + # Any column is actually fine: + new_rows = len(compat.next(compat.itervalues(col_dict))) + index = RangeIndex(self._currow, self._currow + new_rows) + else: + new_rows = 0 + else: + new_rows = len(index) + df = DataFrame(col_dict, columns=columns, index=index) + self._currow += new_rows + if self.squeeze and len(df.columns) == 1: return df[df.columns[0]].copy() return df @@ -887,9 +986,11 @@ def _validate_usecols_arg(usecols): if usecols is not None: usecols_dtype = lib.infer_dtype(usecols) - if usecols_dtype not in ('integer', 'string', 'unicode'): + if usecols_dtype not in ('empty', 'integer', + 'string', 'unicode'): raise ValueError(msg) + return set(usecols) return usecols @@ -904,7 +1005,7 @@ def _validate_parse_dates_arg(parse_dates): "for the 'parse_dates' parameter") if parse_dates is not None: - if lib.isscalar(parse_dates): + if is_scalar(parse_dates): if not lib.is_bool(parse_dates): raise TypeError(msg) @@ -935,6 +1036,7 @@ def __init__(self, kwds): self.na_fvalues = kwds.get('na_fvalues') self.true_values = kwds.get('true_values') self.false_values = kwds.get('false_values') + self.as_recarray = kwds.get('as_recarray', False) self.tupleize_cols = kwds.get('tupleize_cols', False) self.mangle_dupe_cols = kwds.get('mangle_dupe_cols', True) self.infer_datetime_format = kwds.pop('infer_datetime_format', False) @@ -963,8 +1065,8 @@ def __init__(self, kwds): is_sequence = isinstance(self.index_col, (list, tuple, np.ndarray)) if not (is_sequence and - all(map(com.is_integer, self.index_col)) or - com.is_integer(self.index_col)): + all(map(is_integer, self.index_col)) or + is_integer(self.index_col)): raise ValueError("index_col must only contain row numbers " "when specifying a multi-index header") @@ -972,8 +1074,13 @@ def __init__(self, kwds): self._first_chunk = True + # GH 13932 + # keep references to file handles opened by the parser itself + self.handles = [] + def close(self): - self._reader.close() + for f in self.handles: + f.close() @property def _has_complex_date_col(self): @@ -989,7 +1096,7 @@ def _should_parse_dates(self, i): name = self.index_names[i] j = self.index_col[i] - if lib.isscalar(self.parse_dates): + if 
is_scalar(self.parse_dates): return (j == self.parse_dates) or (name == self.parse_dates) else: return (j in self.parse_dates) or (name in self.parse_dates) @@ -1206,6 +1313,12 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, cvals, na_count = self._convert_types( values, set(col_na_values) | col_na_fvalues, coerce_type) + + if issubclass(cvals.dtype.type, np.integer) and self.compact_ints: + cvals = lib.downcast_int64( + cvals, _parser.na_values, + self.use_unsigned) + result[c] = cvals if verbose and na_count: print('Filled %d NA values in column %s' % (na_count, str(c))) @@ -1217,7 +1330,7 @@ def _convert_types(self, values, na_values, try_num_bool=True): mask = lib.ismember(values, na_values) na_count = mask.sum() if na_count > 0: - if com.is_integer_dtype(values): + if is_integer_dtype(values): values = values.astype(np.float64) np.putmask(values, mask, np.nan) return values, na_count @@ -1260,12 +1373,12 @@ def __init__(self, src, **kwds): self.kwds = kwds kwds = kwds.copy() - self.as_recarray = kwds.get('as_recarray', False) ParserBase.__init__(self, kwds) if 'utf-16' in (kwds.get('encoding') or ''): if isinstance(src, compat.string_types): src = open(src, 'rb') + self.handles.append(src) src = UTF8Recoder(src, kwds['encoding']) kwds['encoding'] = 'utf-8' @@ -1339,15 +1452,23 @@ def __init__(self, src, **kwds): self._implicit_index = self._reader.leading_cols > 0 + def close(self): + for f in self.handles: + f.close() + try: + self._reader.close() + except: + pass + def _set_noconvert_columns(self): names = self.orig_names usecols = self.usecols def _set(x): - if usecols and com.is_integer(x): + if usecols and is_integer(x): x = list(usecols)[x] - if not com.is_integer(x): + if not is_integer(x): x = names.index(x) self._reader.set_noconvert(x) @@ -1368,6 +1489,13 @@ def _set(x): else: _set(val) + elif self.parse_dates: + if isinstance(self.index_col, list): + for k in self.index_col: + _set(k) + elif self.index_col is not None: + _set(self.index_col) + def set_error_bad_lines(self, status): self._reader.set_error_bad_lines(int(status)) @@ -1501,8 +1629,8 @@ def TextParser(*args, **kwds): has_index_names: boolean, default False True if the cols defined in index_col have an index name and are not in the header - na_values : iterable, default None - Custom NA values + na_values : scalar, str, list-like, or dict, default None + Additional strings to recognize as NA/NaN. keep_default_na : bool, default True thousands : str, default None Thousands separator @@ -1513,7 +1641,7 @@ def TextParser(*args, **kwds): date_parser : function, default None skiprows : list of integers Row numbers to skip - skip_footer : int + skipfooter : int Number of line at bottom of file to skip converters : dict, default None Dict of functions for converting values in certain columns. 
Keys can @@ -1623,9 +1751,10 @@ def __init__(self, f, **kwds): self.encoding = kwds['encoding'] self.compression = kwds['compression'] + self.memory_map = kwds['memory_map'] self.skiprows = kwds['skiprows'] - self.skip_footer = kwds['skip_footer'] + self.skipfooter = kwds['skipfooter'] self.delimiter = kwds['delimiter'] self.quotechar = kwds['quotechar'] @@ -1648,21 +1777,28 @@ def __init__(self, f, **kwds): self.verbose = kwds['verbose'] self.converters = kwds['converters'] + self.compact_ints = kwds['compact_ints'] + self.use_unsigned = kwds['use_unsigned'] self.thousands = kwds['thousands'] self.decimal = kwds['decimal'] + self.comment = kwds['comment'] self._comment_lines = [] if isinstance(f, compat.string_types): f = _get_handle(f, 'r', encoding=self.encoding, - compression=self.compression) + compression=self.compression, + memory_map=self.memory_map) + self.handles.append(f) elif self.compression: f = _wrap_compressed(f, self.compression, self.encoding) + self.handles.append(f) # in Python 3, convert BytesIO or fileobjects passed with an encoding elif compat.PY3 and isinstance(f, compat.BytesIO): from io import TextIOWrapper f = TextIOWrapper(f, encoding=self.encoding) + self.handles.append(f) # Set self.data to something that can read lines. if hasattr(f, 'readline'): @@ -1722,7 +1858,7 @@ def _set_no_thousands_columns(self): noconvert_columns = set() def _set(x): - if com.is_integer(x): + if is_integer(x): noconvert_columns.add(x) else: noconvert_columns.add(self.columns.index(x)) @@ -1742,6 +1878,14 @@ def _set(x): _set(k) else: _set(val) + + elif self.parse_dates: + if isinstance(self.index_col, list): + for k in self.index_col: + _set(k) + elif self.index_col is not None: + _set(self.index_col) + return noconvert_columns def _make_reader(self, f): @@ -1800,7 +1944,11 @@ class MyDialect(csv.Dialect): else: def _read(): - line = next(f) + line = f.readline() + + if compat.PY2 and self.encoding: + line = line.decode(self.encoding) + pat = re.compile(sep) yield pat.split(line.strip()) for line in f: @@ -1842,6 +1990,9 @@ def read(self, rows=None): columns, data = self._do_date_conversions(columns, data) data = self._convert_data(data) + if self.as_recarray: + return self._to_recarray(data, columns) + index, columns = self._make_index(data, alldata, columns, indexnamerow) return index, columns, data @@ -1881,6 +2032,19 @@ def _convert_data(self, data): return self._convert_to_ndarrays(data, self.na_values, self.na_fvalues, self.verbose, clean_conv) + def _to_recarray(self, data, columns): + dtypes = [] + o = compat.OrderedDict() + + # use the columns to "order" the keys + # in the unordered 'data' dictionary + for col in columns: + dtypes.append((str(col), data[col].dtype)) + o[col] = data[col] + + tuples = lzip(*o.values()) + return np.array(tuples, dtypes) + def _infer_columns(self): names = self.names num_original_columns = 0 @@ -2053,6 +2217,67 @@ def _buffered_line(self): else: return self._next_line() + def _check_for_bom(self, first_row): + """ + Checks whether the file begins with the BOM character. + If it does, remove it. In addition, if there is quoting + in the field subsequent to the BOM, remove it as well + because it technically takes place at the beginning of + the name, not the middle of it. + """ + # first_row will be a list, so we need to check + # that that list is not empty before proceeding. + if not first_row: + return first_row + + # The first element of this row is the one that could have the + # BOM that we want to remove. 
Check that the first element is a + # string before proceeding. + if not isinstance(first_row[0], compat.string_types): + return first_row + + # Check that the string is not empty, as that would + # obviously not have a BOM at the start of it. + if not first_row[0]: + return first_row + + # Since the string is non-empty, check that it does + # in fact begin with a BOM. + first_elt = first_row[0][0] + + # This is to avoid warnings we get in Python 2.x if + # we find ourselves comparing with non-Unicode + if compat.PY2 and not isinstance(first_elt, unicode): # noqa + try: + first_elt = u(first_elt) + except UnicodeDecodeError: + return first_row + + if first_elt != _BOM: + return first_row + + first_row = first_row[0] + + if len(first_row) > 1 and first_row[1] == self.quotechar: + start = 2 + quote = first_row[1] + end = first_row[2:].index(quote) + 2 + + # Extract the data between the quotation marks + new_row = first_row[start:end] + + # Extract any remaining data after the second + # quotation mark. + if len(first_row) > end + 1: + new_row += first_row[end + 1:] + return [new_row] + elif len(first_row) > 1: + return [first_row[1:]] + else: + # First row is just the BOM, so we + # return an empty string. + return [""] + def _empty(self, line): return not line or all(not x for x in line) @@ -2082,7 +2307,17 @@ def _next_line(self): next(self.data) while True: - orig_line = next(self.data) + try: + orig_line = next(self.data) + except csv.Error as e: + if 'NULL byte' in str(e): + raise csv.Error( + 'NULL byte detected. This byte ' + 'cannot be processed in Python\'s ' + 'native csv library at the moment, ' + 'so please pass in engine=\'c\' instead.') + else: + raise line = self._check_comments([orig_line])[0] self.pos += 1 if (not self.skip_blank_lines and @@ -2094,6 +2329,12 @@ def _next_line(self): line = ret[0] break + # This was the first line of the file, + # which could contain the BOM at the + # beginning of it. + if self.pos == 1: + line = self._check_for_bom(line) + self.line_pos += 1 self.buf.append(line) return line @@ -2237,7 +2478,7 @@ def _rows_to_cols(self, content): content, min_width=col_len).T) zip_len = len(zipped_content) - if self.skip_footer < 0: + if self.skipfooter < 0: raise ValueError('skip footer cannot be negative') # Loop through rows to verify lengths are correct. 
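The _check_for_bom hunk above makes the Python engine strip a UTF-8 byte-order mark (and any quoting wrapped around it) from the first field instead of folding it into the first column name. A minimal sketch of the case it targets, assuming a pandas build that contains this change; the file name is illustrative:

import pandas as pd

# write a CSV whose first three bytes are the UTF-8 BOM (EF BB BF)
with open('bom.csv', 'wb') as fh:
    fh.write(b'\xef\xbb\xbfcol1,col2\n1,2\n')

df = pd.read_csv('bom.csv', engine='python')
print(list(df.columns))  # ['col1', 'col2'] rather than ['\ufeffcol1', 'col2']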
@@ -2250,8 +2491,8 @@ def _rows_to_cols(self, content): break footers = 0 - if self.skip_footer: - footers = self.skip_footer + if self.skipfooter: + footers = self.skipfooter row_num = self.pos - (len(content) - i + footers) @@ -2337,8 +2578,8 @@ def _get_lines(self, rows=None): else: lines = new_rows - if self.skip_footer: - lines = lines[:-self.skip_footer] + if self.skipfooter: + lines = lines[:-self.skipfooter] lines = self._check_comments(lines) if self.skip_blank_lines: @@ -2354,8 +2595,8 @@ def converter(*date_cols): strs = _concat_date_cols(date_cols) try: - return tools._to_datetime( - com._ensure_object(strs), + return tools.to_datetime( + _ensure_object(strs), utc=None, box=False, dayfirst=dayfirst, @@ -2408,7 +2649,7 @@ def _isindex(colspec): if isinstance(parse_spec, list): # list of column lists for colspec in parse_spec: - if lib.isscalar(colspec): + if is_scalar(colspec): if isinstance(colspec, int) and colspec not in data_dict: colspec = orig_names[colspec] if _isindex(colspec): @@ -2479,13 +2720,15 @@ def _clean_na_values(na_values, keep_default_na=True): elif isinstance(na_values, dict): if keep_default_na: for k, v in compat.iteritems(na_values): - v = set(list(v)) | _NA_VALUES + if not is_list_like(v): + v = [v] + v = set(v) | _NA_VALUES na_values[k] = v na_fvalues = dict([ (k, _floatify_na_values(v)) for k, v in na_values.items() # noqa ]) else: - if not com.is_list_like(na_values): + if not is_list_like(na_values): na_values = [na_values] na_values = _stringify_na_values(na_values) if keep_default_na: @@ -2538,7 +2781,7 @@ def _get_empty_meta(columns, index_col, index_names, dtype=None): if not isinstance(dtype, dict): dtype = defaultdict(lambda: dtype) # Convert column indexes to column names. - dtype = dict((columns[k] if com.is_integer(k) else k, v) + dtype = dict((columns[k] if is_integer(k) else k, v) for k, v in compat.iteritems(dtype)) if index_col is None or index_col is False: diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index c19dae7f3545e..2358c296f782e 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -3,7 +3,7 @@ import numpy as np from numpy.lib.format import read_array, write_array from pandas.compat import BytesIO, cPickle as pkl, pickle_compat as pc, PY3 -import pandas.core.common as com +from pandas.types.common import is_datetime64_dtype, _NS_DTYPE def to_pickle(obj, path): @@ -86,7 +86,7 @@ def _unpickle_array(bytes): # All datetimes should be stored as M8[ns]. When unpickling with # numpy1.6, it will read these as M8[us]. 
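The _clean_na_values change above wraps scalar dictionary values in a list before merging them with the default NA strings, so per-column na_values no longer have to be list-like. A short sketch; the column names and the 'missing' sentinel are illustrative:

import pandas as pd
from pandas.compat import StringIO

csv = 'name,score\nalice,10\nbob,missing\ncarol,NA\n'

# a scalar per column is now accepted and merged with the default NA strings
df = pd.read_csv(StringIO(csv), na_values={'score': 'missing'})
print(df['score'].isnull().sum())  # 2: the custom 'missing' plus the default 'NA'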
So this ensures all # datetime64 types are read as MS[ns] - if com.is_datetime64_dtype(arr): - arr = arr.view(com._NS_DTYPE) + if is_datetime64_dtype(arr): + arr = arr.view(_NS_DTYPE) return arr diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index fcf5125d956c6..b8c2b146b6259 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -12,14 +12,24 @@ import warnings import os +from pandas.types.common import (is_list_like, + is_categorical_dtype, + is_timedelta64_dtype, + is_datetime64tz_dtype, + is_datetime64_dtype, + _ensure_object, + _ensure_int64, + _ensure_platform_int) +from pandas.types.missing import array_equivalent + import numpy as np import pandas as pd from pandas import (Series, DataFrame, Panel, Panel4D, Index, - MultiIndex, Int64Index) + MultiIndex, Int64Index, isnull) from pandas.core import config from pandas.io.common import _stringify_path -from pandas.sparse.api import SparseSeries, SparseDataFrame, SparsePanel +from pandas.sparse.api import SparseSeries, SparseDataFrame from pandas.sparse.array import BlockIndex, IntIndex from pandas.tseries.api import PeriodIndex, DatetimeIndex from pandas.tseries.tdi import TimedeltaIndex @@ -27,12 +37,11 @@ from pandas.formats.printing import adjoin, pprint_thing from pandas.core.common import _asarray_tuplesafe, PerformanceWarning from pandas.core.algorithms import match, unique -from pandas.core.categorical import Categorical +from pandas.core.categorical import Categorical, _factorize_from_iterables from pandas.core.internals import (BlockManager, make_block, _block2d_to_blocknd, _factor_indexer, _block_shape) from pandas.core.index import _ensure_index -import pandas.core.common as com from pandas.tools.merge import concat from pandas import compat from pandas.compat import u_safe as u, PY3, range, lrange, string_types, filter @@ -160,7 +169,6 @@ class DuplicateWarning(Warning): SparseDataFrame: u('sparse_frame'), Panel: u('wide'), Panel4D: u('ndim'), - SparsePanel: u('sparse_panel') } # storer class map @@ -174,7 +182,6 @@ class DuplicateWarning(Warning): u('frame'): 'FrameFixed', u('sparse_frame'): 'SparseFrameFixed', u('wide'): 'PanelFixed', - u('sparse_panel'): 'SparsePanelFixed', } # table class map @@ -276,10 +283,10 @@ def read_hdf(path_or_buf, key=None, **kwargs): path_or_buf : path (string), buffer, or path object (pathlib.Path or py._path.local.LocalPath) to read from - .. versionadded:: 0.18.2 support for pathlib, py.path. + .. versionadded:: 0.19.0 support for pathlib, py.path. - key : group identifier in the store. Can be omitted a HDF file contains - a single pandas object. + key : group identifier in the store. Can be omitted if the HDF file + contains a single pandas object. where : list of Term (or convertable) objects, optional start : optional, integer (defaults to None), row number to start selection @@ -296,6 +303,10 @@ def read_hdf(path_or_buf, key=None, **kwargs): """ + if kwargs.get('mode', 'a') not in ['r', 'r+', 'a']: + raise ValueError('mode {0} is not allowed while performing a read. ' + 'Allowed modes are r, r+ and a.' 
+ .format(kwargs.get('mode'))) # grab the scope if 'where' in kwargs: kwargs['where'] = _ensure_term(kwargs['where'], scope_level=1) @@ -311,7 +322,8 @@ def read_hdf(path_or_buf, key=None, **kwargs): exists = False if not exists: - raise IOError('File %s does not exist' % path_or_buf) + raise compat.FileNotFoundError( + 'File %s does not exist' % path_or_buf) # can't auto open/close if we are using an iterator # so delegate to the iterator @@ -331,11 +343,20 @@ def read_hdf(path_or_buf, key=None, **kwargs): try: if key is None: - keys = store.keys() - if len(keys) != 1: - raise ValueError('key must be provided when HDF file contains ' - 'multiple datasets.') - key = keys[0] + groups = store.groups() + if len(groups) == 0: + raise ValueError('No dataset in HDF5 file.') + candidate_only_group = groups[0] + + # For the HDF file to have only one dataset, all other groups + # should then be metadata groups for that candidate group. (This + # assumes that the groups() method enumerates parent groups + # before their children.) + for group_to_check in groups[1:]: + if not _is_metadata_of(group_to_check, candidate_only_group): + raise ValueError('key must be provided when HDF5 file ' + 'contains multiple datasets.') + key = candidate_only_group._v_pathname return store.select(key, auto_close=auto_close, **kwargs) except: # if there is an error, close the store @@ -347,6 +368,20 @@ def read_hdf(path_or_buf, key=None, **kwargs): raise +def _is_metadata_of(group, parent_group): + """Check if a given group is a metadata group for a given parent_group.""" + if group._v_depth <= parent_group._v_depth: + return False + + current = group + while current._v_depth > 1: + parent = current._v_parent + if parent == parent_group and current._v_name == 'meta': + return True + current = current._v_parent + return False + + class HDFStore(StringMixin): """ @@ -825,6 +860,9 @@ def put(self, key, value, format=None, append=False, **kwargs): append : boolean, default False This will force Table format, append the input data to the existing. + data_columns : list of columns to create as data columns, or True to + use all columns. See + `here `__ # noqa encoding : default None, provide an encoding for strings dropna : boolean, default False, do not write an ALL nan row to the store settable by the option 'io.hdf.dropna_table' @@ -901,8 +939,11 @@ def append(self, key, value, format=None, append=True, columns=None, / selecting subsets of the data append : boolean, default True, append the input data to the existing - data_columns : list of columns to create as data columns, or True to - use all columns + data_columns : list of columns, or True, default None + List of columns to create as indexed data columns for on-disk + queries, or True to use all columns. By default only the axes + of the object are indexed. See `here + `__. 
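With the read_hdf changes above, the key argument can be dropped when the HDF5 file holds a single pandas object (its metadata groups no longer count as separate datasets), put/append now document data_columns, and read_hdf rejects write modes up front. A small sketch, assuming PyTables is installed; the file name and key are illustrative:

import pandas as pd

df = pd.DataFrame({'A': range(5), 'B': list('abcde')})

# 'B' becomes an on-disk queryable data column
df.to_hdf('single.h5', 'only_dataset', format='table', data_columns=['B'])

# with exactly one dataset in the file, the key may now be omitted
roundtrip = pd.read_hdf('single.h5')

# pd.read_hdf('single.h5', mode='w')  # would raise ValueError: only r, r+ and a are allowed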
min_itemsize : dict of columns that specify minimum string sizes nan_rep : string to use as string nan represenation chunksize : size to chunk the writing @@ -910,6 +951,7 @@ def append(self, key, value, format=None, append=True, columns=None, encoding : default None, provide an encoding for strings dropna : boolean, default False, do not write an ALL nan row to the store settable by the option 'io.hdf.dropna_table' + Notes ----- Does *not* check if data being appended overlaps with existing @@ -1654,7 +1696,7 @@ def validate_metadata(self, handler): new_metadata = self.metadata cur_metadata = handler.read_metadata(self.cname) if new_metadata is not None and cur_metadata is not None \ - and not com.array_equivalent(new_metadata, cur_metadata): + and not array_equivalent(new_metadata, cur_metadata): raise ValueError("cannot append a categorical with " "different categories to the existing") @@ -2308,6 +2350,11 @@ def f(values, freq=None, tz=None): return DatetimeIndex._simple_new(values, None, freq=freq, tz=tz) return f + elif klass == PeriodIndex: + def f(values, freq=None, tz=None): + return PeriodIndex._simple_new(values, None, freq=freq) + return f + return klass def validate_read(self, kwargs): @@ -2409,7 +2456,9 @@ def write_index(self, key, index): setattr(self.attrs, '%s_variety' % key, 'regular') converted = _convert_index(index, self.encoding, self.format_type).set_name('index') + self.write_array(key, converted.values) + node = getattr(self.group, key) node._v_attrs.kind = converted.kind node._v_attrs.name = index.name @@ -2511,12 +2560,12 @@ def read_index_node(self, node, start=None, stop=None): kwargs['tz'] = node._v_attrs['tz'] if kind in (u('date'), u('datetime')): - index = factory( - _unconvert_index(data, kind, encoding=self.encoding), - dtype=object, **kwargs) + index = factory(_unconvert_index(data, kind, + encoding=self.encoding), + dtype=object, **kwargs) else: - index = factory( - _unconvert_index(data, kind, encoding=self.encoding), **kwargs) + index = factory(_unconvert_index(data, kind, + encoding=self.encoding), **kwargs) index.name = name @@ -2543,7 +2592,7 @@ def write_array(self, key, value, items=None): empty_array = self._is_empty_array(value.shape) transposed = False - if com.is_categorical_dtype(value): + if is_categorical_dtype(value): raise NotImplementedError('Cannot store a category dtype in ' 'a HDF5 dataset that uses format=' '"fixed". 
Use format="table".') @@ -2598,12 +2647,12 @@ def write_array(self, key, value, items=None): if empty_array: self.write_array_empty(key, value) else: - if com.is_datetime64_dtype(value.dtype): + if is_datetime64_dtype(value.dtype): self._handle.create_array( self.group, key, value.view('i8')) getattr( self.group, key)._v_attrs.value_type = 'datetime64' - elif com.is_datetime64tz_dtype(value.dtype): + elif is_datetime64tz_dtype(value.dtype): # store as UTC # with a zone self._handle.create_array(self.group, key, @@ -2612,7 +2661,7 @@ def write_array(self, key, value, items=None): node = getattr(self.group, key) node._v_attrs.tz = _get_tz(value.tz) node._v_attrs.value_type = 'datetime64' - elif com.is_timedelta64_dtype(value.dtype): + elif is_timedelta64_dtype(value.dtype): self._handle.create_array( self.group, key, value.view('i8')) getattr( @@ -2745,39 +2794,6 @@ def write(self, obj, **kwargs): self.write_index('columns', obj.columns) -class SparsePanelFixed(SparseFixed): - pandas_kind = u('sparse_panel') - attributes = ['default_kind', 'default_fill_value'] - - def read(self, **kwargs): - kwargs = self.validate_read(kwargs) - items = self.read_index('items') - - sdict = {} - for name in items: - key = 'sparse_frame_%s' % name - s = SparseFrameFixed(self.parent, getattr(self.group, key)) - s.infer_axes() - sdict[name] = s.read() - return SparsePanel(sdict, items=items, default_kind=self.default_kind, - default_fill_value=self.default_fill_value) - - def write(self, obj, **kwargs): - super(SparsePanelFixed, self).write(obj, **kwargs) - self.attrs.default_fill_value = obj.default_fill_value - self.attrs.default_kind = obj.default_kind - self.write_index('items', obj.items) - - for name, sdf in obj.iteritems(): - key = 'sparse_frame_%s' % name - if key not in self.group._v_children: - node = self._handle.create_group(self.group, key) - else: - node = getattr(self.group, key) - s = SparseFrameFixed(self.parent, node) - s.write(sdf) - - class BlockManagerFixed(GenericFixed): attributes = ['ndim', 'nblocks'] is_shape_reversed = False @@ -3720,11 +3736,12 @@ def read(self, where=None, columns=None, **kwargs): if not self.read_axes(where=where, **kwargs): return None - factors = [Categorical.from_array( - a.values, ordered=True) for a in self.index_axes] - levels = [f.categories for f in factors] - N = [len(f.categories) for f in factors] - labels = [f.codes for f in factors] + lst_vals = [a.values for a in self.index_axes] + labels, levels = _factorize_from_iterables(lst_vals) + # labels and levels are tuples but lists are expected + labels = list(labels) + levels = list(levels) + N = [len(lvl) for lvl in levels] # compute the key key = _factor_indexer(N[1:], labels) @@ -3733,8 +3750,8 @@ def read(self, where=None, columns=None, **kwargs): if len(unique(key)) == len(key): sorter, _ = algos.groupsort_indexer( - com._ensure_int64(key), np.prod(N)) - sorter = com._ensure_platform_int(sorter) + _ensure_int64(key), np.prod(N)) + sorter = _ensure_platform_int(sorter) # create the objs for c in self.values_axes: @@ -3779,7 +3796,7 @@ def read(self, where=None, columns=None, **kwargs): unique_tuples = _asarray_tuplesafe(unique_tuples) indexer = match(unique_tuples, tuple_index) - indexer = com._ensure_platform_int(indexer) + indexer = _ensure_platform_int(indexer) new_index = long_index.take(indexer) new_values = lp.values.take(indexer, axis=0) @@ -3880,7 +3897,7 @@ def write_data(self, chunksize, dropna=False): # figure the mask: only do if we can successfully process this # column, otherwise ignore 
the mask - mask = com.isnull(a.data).all(axis=0) + mask = isnull(a.data).all(axis=0) if isinstance(mask, np.ndarray): masks.append(mask.astype('u1', copy=False)) @@ -4338,9 +4355,10 @@ def _set_tz(values, tz, preserve_UTC=False, coerce=False): coerce : if we do not have a passed timezone, coerce to M8[ns] ndarray """ if tz is not None: + name = getattr(values, 'name', None) values = values.ravel() tz = tslib.get_timezone(_ensure_decoded(tz)) - values = DatetimeIndex(values) + values = DatetimeIndex(values, name=name) if values.tz is None: values = values.tz_localize('UTC').tz_convert(tz) if preserve_UTC: @@ -4368,9 +4386,10 @@ def _convert_index(index, encoding=None, format_type=None): index_name=index_name) elif isinstance(index, (Int64Index, PeriodIndex)): atom = _tables().Int64Col() - return IndexCol( - index.values, 'integer', atom, freq=getattr(index, 'freq', None), - index_name=index_name) + # avoid to store ndarray of Period objects + return IndexCol(index._values, 'integer', atom, + freq=getattr(index, 'freq', None), + index_name=index_name) if isinstance(index, MultiIndex): raise TypeError('MultiIndex not supported here!') @@ -4499,7 +4518,7 @@ def _convert_string_array(data, encoding, itemsize=None): # create the sized dtype if itemsize is None: - itemsize = lib.max_len_string_array(com._ensure_object(data.ravel())) + itemsize = lib.max_len_string_array(_ensure_object(data.ravel())) data = np.asarray(data, dtype="S%d" % itemsize) return data @@ -4528,7 +4547,7 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None): encoding = _ensure_encoding(encoding) if encoding is not None and len(data): - itemsize = lib.max_len_string_array(com._ensure_object(data)) + itemsize = lib.max_len_string_array(_ensure_object(data)) if compat.PY3: dtype = "U{0}".format(itemsize) else: @@ -4596,7 +4615,7 @@ def __init__(self, table, where=None, start=None, stop=None, **kwargs): self.terms = None self.coordinates = None - if com.is_list_like(where): + if is_list_like(where): # see if we have a passed coordinate like try: diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index b75f05cf9ed7e..2a82fd7a53222 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -92,16 +92,24 @@ def __init__(self, path_or_buf, index=None, convert_dates=True, self._path_or_buf, _, _ = get_filepath_or_buffer(path_or_buf) if isinstance(self._path_or_buf, compat.string_types): self._path_or_buf = open(self._path_or_buf, 'rb') + self.handle = self._path_or_buf self._get_properties() self._parse_metadata() + def close(self): + try: + self.handle.close() + except AttributeError: + pass + def _get_properties(self): # Check magic number self._path_or_buf.seek(0) self._cached_page = self._path_or_buf.read(288) if self._cached_page[0:len(const.magic)] != const.magic: + self.close() raise ValueError("magic number mismatch (not a SAS file?)") # Get alignment information @@ -175,6 +183,7 @@ def _get_properties(self): buf = self._path_or_buf.read(self.header_length - 288) self._cached_page += buf if len(self._cached_page) != self.header_length: + self.close() raise ValueError("The SAS7BDAT file appears to be truncated.") self._page_length = self._read_int(const.page_size_offset + align1, @@ -219,6 +228,7 @@ def _get_properties(self): # Read a single float of the given width (4 or 8). 
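The SAS7BDAT reader changes above add an explicit close() and close the handle before raising on malformed input; the read_sas wrapper (changed later in this diff) also closes after a full read. A usage sketch under the assumption that such a file exists; 'example.sas7bdat' is a placeholder path:

import pandas as pd

reader = pd.read_sas('example.sas7bdat', chunksize=10000)
try:
    first = reader.read(10000)   # pull one chunk of rows
finally:
    reader.close()               # explicit cleanup now available on the reader

# a plain read_sas() call closes the underlying handle itself before returning
df = pd.read_sas('example.sas7bdat')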
def _read_float(self, offset, width): if width not in (4, 8): + self.close() raise ValueError("invalid float width") buf = self._read_bytes(offset, width) fd = "f" if width == 4 else "d" @@ -227,6 +237,7 @@ def _read_float(self, offset, width): # Read a single signed integer of the given width (1, 2, 4 or 8). def _read_int(self, offset, width): if width not in (1, 2, 4, 8): + self.close() raise ValueError("invalid int width") buf = self._read_bytes(offset, width) it = {1: "b", 2: "h", 4: "l", 8: "q"}[width] @@ -238,11 +249,13 @@ def _read_bytes(self, offset, length): self._path_or_buf.seek(offset) buf = self._path_or_buf.read(length) if len(buf) < length: + self.close() msg = "Unable to read {:d} bytes from file position {:d}." raise ValueError(msg.format(length, offset)) return buf else: if offset + length > len(self._cached_page): + self.close() raise ValueError("The cached page is too small.") return self._cached_page[offset:offset + length] @@ -253,6 +266,7 @@ def _parse_metadata(self): if len(self._cached_page) <= 0: break if len(self._cached_page) != self._page_length: + self.close() raise ValueError( "Failed to read a meta data page from the SAS file.") done = self._process_page_meta() @@ -302,6 +316,7 @@ def _get_subheader_index(self, signature, compression, ptype): if (self.compression != "") and f1 and f2: index = const.index.dataSubheaderIndex else: + self.close() raise ValueError("Unknown subheader signature") return index @@ -598,6 +613,7 @@ def _read_next_page(self): if len(self._cached_page) <= 0: return True elif len(self._cached_page) != self._page_length: + self.close() msg = ("failed to read complete page from file " "(read {:d} of {:d} bytes)") raise ValueError(msg.format(len(self._cached_page), @@ -643,6 +659,7 @@ def _chunk_to_dataframe(self): rslt.loc[ii, name] = np.nan js += 1 else: + self.close() raise ValueError("unknown column type %s" % self.column_types[j]) diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py index e4ca99fdcb109..76fc55154bc49 100644 --- a/pandas/io/sas/sas_xport.py +++ b/pandas/io/sas/sas_xport.py @@ -253,6 +253,9 @@ def __init__(self, filepath_or_buffer, index=None, encoding='ISO-8859-1', self._read_header() + def close(self): + self.filepath_or_buffer.close() + def _get_row(self): return self.filepath_or_buffer.read(80).decode() @@ -262,6 +265,7 @@ def _read_header(self): # read file header line1 = self._get_row() if line1 != _correct_line1: + self.close() raise ValueError("Header record is not an XPORT file.") line2 = self._get_row() @@ -269,6 +273,7 @@ def _read_header(self): ['_', 24], ['created', 16]] file_info = _split_line(line2, fif) if file_info['prefix'] != "SAS SAS SASLIB": + self.close() raise ValueError("Header record has invalid prefix.") file_info['created'] = _parse_date(file_info['created']) self.file_info = file_info @@ -282,6 +287,7 @@ def _read_header(self): headflag1 = header1.startswith(_correct_header1) headflag2 = (header2 == _correct_header2) if not (headflag1 and headflag2): + self.close() raise ValueError("Member header not found") # usually 140, could be 135 fieldnamelength = int(header1[-5:-2]) @@ -321,6 +327,7 @@ def _read_header(self): field['ntype'] = types[field['ntype']] fl = field['field_length'] if field['ntype'] == 'numeric' and ((fl < 2) or (fl > 8)): + self.close() msg = "Floating field width {0} is not between 2 and 8." 
raise TypeError(msg.format(fl)) @@ -335,6 +342,7 @@ def _read_header(self): header = self._get_row() if not header == _correct_obs_header: + self.close() raise ValueError("Observation header not found.") self.fields = fields @@ -425,6 +433,7 @@ def read(self, nrows=None): read_lines = min(nrows, self.nobs - self._lines_read) read_len = read_lines * self.record_length if read_len <= 0: + self.close() raise StopIteration raw = self.filepath_or_buffer.read(read_len) data = np.frombuffer(raw, dtype=self._dtype, count=read_lines) diff --git a/pandas/io/sas/saslib.pyx b/pandas/io/sas/saslib.pyx index ac73ae37ca70e..a66d62ea41581 100644 --- a/pandas/io/sas/saslib.pyx +++ b/pandas/io/sas/saslib.pyx @@ -10,12 +10,14 @@ import sas_constants as const # algorithm. It is partially documented here: # # https://cran.r-project.org/web/packages/sas7bdat/vignettes/sas7bdat.pdf -cdef np.ndarray[uint8_t, ndim=1] rle_decompress(int result_length, np.ndarray[uint8_t, ndim=1] inbuff): +cdef np.ndarray[uint8_t, ndim=1] rle_decompress( + int result_length, np.ndarray[uint8_t, ndim=1] inbuff): cdef: uint8_t control_byte, x uint8_t [:] result = np.zeros(result_length, np.uint8) - int rpos = 0, ipos = 0, i, nbytes, end_of_first_byte, length = len(inbuff) + int rpos = 0, ipos = 0, length = len(inbuff) + int i, nbytes, end_of_first_byte while ipos < length: control_byte = inbuff[ipos] & 0xF0 @@ -41,13 +43,13 @@ cdef np.ndarray[uint8_t, ndim=1] rle_decompress(int result_length, np.ndarray[ui rpos += 1 ipos += 1 elif control_byte == 0x60: - nbytes = end_of_first_byte*256 + (inbuff[ipos]) + 17 + nbytes = end_of_first_byte * 256 + (inbuff[ipos]) + 17 ipos += 1 for i in range(nbytes): result[rpos] = 0x20 rpos += 1 elif control_byte == 0x70: - nbytes = end_of_first_byte*256 + (inbuff[ipos]) + 17 + nbytes = end_of_first_byte * 256 + (inbuff[ipos]) + 17 ipos += 1 for i in range(nbytes): result[rpos] = 0x00 @@ -109,8 +111,9 @@ cdef np.ndarray[uint8_t, ndim=1] rle_decompress(int result_length, np.ndarray[ui # rdc_decompress decompresses data using the Ross Data Compression algorithm: # -# http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm -cdef np.ndarray[uint8_t, ndim=1] rdc_decompress(int result_length, np.ndarray[uint8_t, ndim=1] inbuff): +# http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm +cdef np.ndarray[uint8_t, ndim=1] rdc_decompress( + int result_length, np.ndarray[uint8_t, ndim=1] inbuff): cdef: uint8_t cmd @@ -124,7 +127,8 @@ cdef np.ndarray[uint8_t, ndim=1] rdc_decompress(int result_length, np.ndarray[ui ii += 1 ctrl_mask = ctrl_mask >> 1 if ctrl_mask == 0: - ctrl_bits = (inbuff[ipos] << 8) + inbuff[ipos + 1] + ctrl_bits = ((inbuff[ipos] << 8) + + inbuff[ipos + 1]) ipos += 2 ctrl_mask = 0x8000 @@ -219,7 +223,8 @@ cdef class Parser(object): int subheader_pointer_length int current_page_type bint is_little_endian - np.ndarray[uint8_t, ndim=1] (*decompress)(int result_length, np.ndarray[uint8_t, ndim=1] inbuff) + np.ndarray[uint8_t, ndim=1] (*decompress)( + int result_length, np.ndarray[uint8_t, ndim=1] inbuff) object parser def __init__(self, object parser): @@ -252,7 +257,8 @@ cdef class Parser(object): elif column_types[j] == b's': self.column_types[j] = column_type_string else: - raise ValueError("unknown column type: %s" % self.parser.columns[j].ctype) + raise ValueError("unknown column type: " + "%s" % self.parser.columns[j].ctype) # compression if parser.compression == const.rle_compression: @@ -279,7 +285,8 @@ 
cdef class Parser(object): # update the parser self.parser._current_row_on_page_index = self.current_row_on_page_index - self.parser._current_row_in_chunk_index = self.current_row_in_chunk_index + self.parser._current_row_in_chunk_index =\ + self.current_row_in_chunk_index self.parser._current_row_in_file_index = self.current_row_in_file_index cdef bint read_next_page(self): @@ -299,13 +306,16 @@ cdef class Parser(object): self.current_row_on_page_index = 0 self.current_page_type = self.parser._current_page_type self.current_page_block_count = self.parser._current_page_block_count - self.current_page_data_subheader_pointers_len = len(self.parser._current_page_data_subheader_pointers) - self.current_page_subheaders_count = self.parser._current_page_subheaders_count + self.current_page_data_subheader_pointers_len = len( + self.parser._current_page_data_subheader_pointers) + self.current_page_subheaders_count =\ + self.parser._current_page_subheaders_count cdef bint readline(self): cdef: - int offset, bit_offset, align_correction, subheader_pointer_length, mn + int offset, bit_offset, align_correction + int subheader_pointer_length, mn bint done, flag bit_offset = self.bit_offset @@ -321,7 +331,8 @@ cdef class Parser(object): # Loop until a data row is read while True: if self.current_page_type == page_meta_type: - flag = self.current_row_on_page_index >= self.current_page_data_subheader_pointers_len + flag = self.current_row_on_page_index >=\ + self.current_page_data_subheader_pointers_len if flag: done = self.read_next_page() if done: @@ -330,10 +341,12 @@ cdef class Parser(object): current_subheader_pointer = ( self.parser._current_page_data_subheader_pointers[ self.current_row_on_page_index]) - self.process_byte_array_with_data(current_subheader_pointer.offset, - current_subheader_pointer.length) + self.process_byte_array_with_data( + current_subheader_pointer.offset, + current_subheader_pointer.length) return False - elif self.current_page_type == page_mix_types_0 or self.current_page_type == page_mix_types_1: + elif (self.current_page_type == page_mix_types_0 or + self.current_page_type == page_mix_types_1): align_correction = (bit_offset + subheader_pointers_offset + self.current_page_subheaders_count * subheader_pointer_length) @@ -345,18 +358,18 @@ cdef class Parser(object): offset += self.current_row_on_page_index * self.row_length self.process_byte_array_with_data(offset, self.row_length) - mn = min(self.parser.row_count, self.parser._mix_page_row_count) + mn = min(self.parser.row_count, + self.parser._mix_page_row_count) if self.current_row_on_page_index == mn: done = self.read_next_page() if done: return True return False elif self.current_page_type == page_data_type: - self.process_byte_array_with_data(bit_offset + - subheader_pointers_offset + - self.current_row_on_page_index * - self.row_length, - self.row_length) + self.process_byte_array_with_data( + bit_offset + subheader_pointers_offset + + self.current_row_on_page_index * self.row_length, + self.row_length) flag = (self.current_row_on_page_index == self.current_page_block_count) if flag: @@ -371,17 +384,18 @@ cdef class Parser(object): cdef void process_byte_array_with_data(self, int offset, int length): cdef: - Py_ssize_t j - int s, k, m, jb, js, current_row - int64_t lngt, start, ct - np.ndarray[uint8_t, ndim=1] source - int64_t[:] column_types - int64_t[:] lengths - int64_t[:] offsets - uint8_t[:, :] byte_chunk - object[:, :] string_chunk - - source = np.frombuffer(self.cached_page[offset:offset+length], 
dtype=np.uint8) + Py_ssize_t j + int s, k, m, jb, js, current_row + int64_t lngt, start, ct + np.ndarray[uint8_t, ndim=1] source + int64_t[:] column_types + int64_t[:] lengths + int64_t[:] offsets + uint8_t[:, :] byte_chunk + object[:, :] string_chunk + + source = np.frombuffer( + self.cached_page[offset:offset + length], dtype=np.uint8) if self.decompress != NULL and (length < self.row_length): source = self.decompress(self.row_length, source) @@ -408,11 +422,12 @@ cdef class Parser(object): else: m = s for k in range(lngt): - byte_chunk[jb, m + k] = source[start + k] + byte_chunk[jb, m + k] = source[start + k] jb += 1 elif column_types[j] == column_type_string: # string - string_chunk[js, current_row] = source[start:(start+lngt)].tostring().rstrip() + string_chunk[js, current_row] = source[start:( + start + lngt)].tostring().rstrip() js += 1 self.current_row_on_page_index += 1 diff --git a/pandas/io/sas/sasreader.py b/pandas/io/sas/sasreader.py index 9a60200c78893..081d780f71cb3 100644 --- a/pandas/io/sas/sasreader.py +++ b/pandas/io/sas/sasreader.py @@ -58,4 +58,6 @@ def read_sas(filepath_or_buffer, format=None, index=None, encoding=None, if iterator or chunksize: return reader - return reader.read() + data = reader.read() + reader.close() + return data diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 324988360c9fe..47642c2e2bc28 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -8,18 +8,19 @@ from datetime import datetime, date, time import warnings -import traceback import re import numpy as np import pandas.lib as lib -import pandas.core.common as com -from pandas.compat import (lzip, map, zip, raise_with_traceback, +from pandas.types.missing import isnull +from pandas.types.dtypes import DatetimeTZDtype +from pandas.types.common import (is_list_like, is_dict_like, + is_datetime64tz_dtype) + +from pandas.compat import (map, zip, raise_with_traceback, string_types, text_type) from pandas.core.api import DataFrame, Series -from pandas.core.common import isnull from pandas.core.base import PandasObject -from pandas.types.api import DatetimeTZDtype from pandas.tseries.tools import to_datetime from contextlib import contextmanager @@ -39,6 +40,24 @@ class DatabaseError(IOError): _SQLALCHEMY_INSTALLED = None +def _validate_flavor_parameter(flavor): + """ + Checks whether a database 'flavor' was specified. + If not None, produces FutureWarning if 'sqlite' and + raises a ValueError if anything else. 
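The _validate_flavor_parameter helper introduced here centralises handling of the deprecated flavor argument: None passes silently, 'sqlite' emits a FutureWarning, and anything else raises ValueError. A short illustration against an in-memory SQLite database, assuming a pandas build with this change:

import sqlite3
import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3]})
con = sqlite3.connect(':memory:')

df.to_sql('t1', con, index=False)                    # preferred: no flavor at all
df.to_sql('t2', con, flavor='sqlite', index=False)   # still works, but emits a FutureWarning
# df.to_sql('t3', con, flavor='mysql', index=False)  # ValueError: flavor is not supported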
+ """ + if flavor is not None: + if flavor == 'sqlite': + warnings.warn("the 'flavor' parameter is deprecated " + "and will be removed in a future version, " + "as 'sqlite' is the only supported option " + "when SQLAlchemy is not installed.", + FutureWarning, stacklevel=2) + else: + raise ValueError("database flavor {flavor} is not " + "supported".format(flavor=flavor)) + + def _is_sqlalchemy_connectable(con): global _SQLALCHEMY_INSTALLED if _SQLALCHEMY_INSTALLED is None: @@ -90,7 +109,7 @@ def _handle_date_column(col, format=None): # parse dates as timestamp format = 's' if format is None else format return to_datetime(col, errors='coerce', unit=format, utc=True) - elif com.is_datetime64tz_dtype(col): + elif is_datetime64tz_dtype(col): # coerce to UTC timezone # GH11216 return (to_datetime(col, errors='coerce') @@ -123,7 +142,7 @@ def _parse_date_columns(data_frame, parse_dates): # we could in theory do a 'nice' conversion from a FixedOffset tz # GH11216 for col_name, df_col in data_frame.iteritems(): - if com.is_datetime64tz_dtype(df_col): + if is_datetime64tz_dtype(df_col): data_frame[col_name] = _handle_date_column(df_col) return data_frame @@ -172,125 +191,6 @@ def execute(sql, con, cur=None, params=None): return pandas_sql.execute(*args) -# ----------------------------------------------------------------------------- -# -- Deprecated tquery and uquery - -def _safe_fetch(cur): - try: - result = cur.fetchall() - if not isinstance(result, list): - result = list(result) - return result - except Exception as e: # pragma: no cover - excName = e.__class__.__name__ - if excName == 'OperationalError': - return [] - - -def tquery(sql, con=None, cur=None, retry=True): - """ - DEPRECATED. Returns list of tuples corresponding to each row in given sql - query. - - If only one column selected, then plain list is returned. - - To obtain the same result in the future, you can use the following: - - >>> execute(sql, con, params).fetchall() - - Parameters - ---------- - sql: string - SQL query to be executed - con: DBAPI2 connection, default: None - cur: deprecated, cursor is obtained from connection, default: None - retry: boolean value to specify whether to retry after failure - default: True - - Returns - ------- - Results Iterable - - """ - warnings.warn( - "tquery is deprecated, and will be removed in future versions. " - "You can use ``execute(...).fetchall()`` instead.", - FutureWarning, stacklevel=2) - - cur = execute(sql, con, cur=cur) - result = _safe_fetch(cur) - - if con is not None: - try: - cur.close() - con.commit() - except Exception as e: - excName = e.__class__.__name__ - if excName == 'OperationalError': # pragma: no cover - print('Failed to commit, may need to restart interpreter') - else: - raise - - traceback.print_exc() - if retry: - return tquery(sql, con=con, retry=False) - - if result and len(result[0]) == 1: - # python 3 compat - result = list(lzip(*result)[0]) - elif result is None: # pragma: no cover - result = [] - - return result - - -def uquery(sql, con=None, cur=None, retry=True, params=None): - """ - DEPRECATED. Does the same thing as tquery, but instead of returning - results, it returns the number of rows affected. Good for update queries. 
- - To obtain the same result in the future, you can use the following: - - >>> execute(sql, con).rowcount - - Parameters - ---------- - sql: string - SQL query to be executed - con: DBAPI2 connection, default: None - cur: deprecated, cursor is obtained from connection, default: None - retry: boolean value to specify whether to retry after failure - default: True - params: list or tuple, optional, default: None - List of parameters to pass to execute method. - - Returns - ------- - Number of affected rows - - """ - warnings.warn( - "uquery is deprecated, and will be removed in future versions. " - "You can use ``execute(...).rowcount`` instead.", - FutureWarning, stacklevel=2) - - cur = execute(sql, con, cur=cur, params=params) - - result = cur.rowcount - try: - con.commit() - except Exception as e: - excName = e.__class__.__name__ - if excName != 'OperationalError': - raise - - traceback.print_exc() - if retry: - print('Looks like your connection failed, reconnecting...') - return uquery(sql, con, retry=False) - return result - - # ----------------------------------------------------------------------------- # -- Read and write to DataFrames @@ -515,7 +415,7 @@ def read_sql(sql, con, index_col=None, coerce_float=True, params=None, chunksize=chunksize) -def to_sql(frame, name, con, flavor='sqlite', schema=None, if_exists='fail', +def to_sql(frame, name, con, flavor=None, schema=None, if_exists='fail', index=True, index_label=None, chunksize=None, dtype=None): """ Write records stored in a DataFrame to a SQL database. @@ -530,10 +430,8 @@ def to_sql(frame, name, con, flavor='sqlite', schema=None, if_exists='fail', Using SQLAlchemy makes it possible to use any DB supported by that library. If a DBAPI2 object, only sqlite3 is supported. - flavor : {'sqlite', 'mysql'}, default 'sqlite' - The flavor of SQL to use. Ignored when using SQLAlchemy connectable. - 'mysql' is deprecated and will be removed in future versions, but it - will be further supported through SQLAlchemy connectables. + flavor : 'sqlite', default None + DEPRECATED: this parameter will be removed in a future version schema : string, default None Name of SQL schema in database to write to (if database flavor supports this). If None, use default schema (default). @@ -550,9 +448,10 @@ def to_sql(frame, name, con, flavor='sqlite', schema=None, if_exists='fail', chunksize : int, default None If not None, then rows will be written in batches of this size at a time. If None, all rows will be written at once. - dtype : dict of column name to SQL type, default None + dtype : single SQLtype or dict of column name to SQL type, default None Optional specifying the datatype for columns. The SQL type should be a SQLAlchemy type, or a string for sqlite3 fallback connection. + If all columns are of the same type, one single value can be used. """ if if_exists not in ('fail', 'replace', 'append'): @@ -571,7 +470,7 @@ def to_sql(frame, name, con, flavor='sqlite', schema=None, if_exists='fail', chunksize=chunksize, dtype=dtype) -def has_table(table_name, con, flavor='sqlite', schema=None): +def has_table(table_name, con, flavor=None, schema=None): """ Check if DataBase has named table. @@ -583,10 +482,8 @@ def has_table(table_name, con, flavor='sqlite', schema=None): Using SQLAlchemy makes it possible to use any DB supported by that library. If a DBAPI2 object, only sqlite3 is supported. - flavor: {'sqlite', 'mysql'}, default 'sqlite' - The flavor of SQL to use. Ignored when using SQLAlchemy connectable. 
- 'mysql' is deprecated and will be removed in future versions, but it - will be further supported through SQLAlchemy connectables. + flavor : 'sqlite', default None + DEPRECATED: this parameter will be removed in a future version schema : string, default None Name of SQL schema in database to write to (if database flavor supports this). If None, use default schema (default). @@ -601,12 +498,6 @@ def has_table(table_name, con, flavor='sqlite', schema=None): table_exists = has_table -_MYSQL_WARNING = ("The 'mysql' flavor with DBAPI connection is deprecated " - "and will be removed in future versions. " - "MySQL will be further supported with SQLAlchemy " - "connectables.") - - def _engine_builder(con): """ Returns a SQLAlchemy engine from a URI (if con is a string) @@ -630,15 +521,17 @@ def pandasSQL_builder(con, flavor=None, schema=None, meta=None, Convenience function to return the correct PandasSQL subclass based on the provided parameters """ + _validate_flavor_parameter(flavor) + # When support for DBAPI connections is removed, # is_cursor should not be necessary. con = _engine_builder(con) if _is_sqlalchemy_connectable(con): return SQLDatabase(con, schema=schema, meta=meta) + elif isinstance(con, string_types): + raise ImportError("Using URI string without sqlalchemy installed.") else: - if flavor == 'mysql': - warnings.warn(_MYSQL_WARNING, FutureWarning, stacklevel=3) - return SQLiteDatabase(con, flavor, is_cursor=is_cursor) + return SQLiteDatabase(con, is_cursor=is_cursor) class SQLTable(PandasObject): @@ -876,7 +769,7 @@ def _create_table_setup(self): for name, typ, is_index in column_names_and_types] if self.keys is not None: - if not com.is_list_like(self.keys): + if not is_list_like(self.keys): keys = [self.keys] else: keys = self.keys @@ -1033,11 +926,11 @@ class PandasSQL(PandasObject): def read_sql(self, *args, **kwargs): raise ValueError("PandasSQL must be created with an SQLAlchemy " - "connectable or connection+sql flavor") + "connectable or sqlite connection") def to_sql(self, *args, **kwargs): raise ValueError("PandasSQL must be created with an SQLAlchemy " - "connectable or connection+sql flavor") + "connectable or sqlite connection") class SQLDatabase(PandasSQL): @@ -1231,11 +1124,15 @@ def to_sql(self, frame, name, if_exists='fail', index=True, chunksize : int, default None If not None, then rows will be written in batches of this size at a time. If None, all rows will be written at once. - dtype : dict of column name to SQL type, default None + dtype : single type or dict of column name to SQL type, default None Optional specifying the datatype for columns. The SQL type should - be a SQLAlchemy type. + be a SQLAlchemy type. If all columns are of the same type, one + single value can be used. 
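Following the docstring change above, dtype no longer has to be a per-column dictionary: a single SQL type is broadcast to every column (the same shortcut is added to the sqlite fallback further below). A sketch using the sqlite3 fallback, where the type is given as a string; the table name is illustrative:

import sqlite3
import pandas as pd

df = pd.DataFrame({'a': [1, 2], 'b': [3.5, 4.5]})
con = sqlite3.connect(':memory:')

# one value is expanded internally to {'a': 'TEXT', 'b': 'TEXT'}
df.to_sql('as_text', con, index=False, dtype='TEXT')

print(con.execute('PRAGMA table_info(as_text)').fetchall())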
""" + if dtype and not is_dict_like(dtype): + dtype = {col_name: dtype for col_name in frame} + if dtype is not None: from sqlalchemy.types import to_instance, TypeEngine for col, my_type in dtype.items(): @@ -1306,38 +1203,16 @@ def _create_sql_schema(self, frame, table_name, keys=None, dtype=None): # ---- SQL without SQLAlchemy --- -# Flavour specific sql strings and handler class for access to DBs without -# SQLAlchemy installed -# SQL type convertions for each DB +# sqlite-specific sql strings and handler class +# dictionary used for readability purposes _SQL_TYPES = { - 'string': { - 'mysql': 'VARCHAR (63)', - 'sqlite': 'TEXT', - }, - 'floating': { - 'mysql': 'DOUBLE', - 'sqlite': 'REAL', - }, - 'integer': { - 'mysql': 'BIGINT', - 'sqlite': 'INTEGER', - }, - 'datetime': { - 'mysql': 'DATETIME', - 'sqlite': 'TIMESTAMP', - }, - 'date': { - 'mysql': 'DATE', - 'sqlite': 'DATE', - }, - 'time': { - 'mysql': 'TIME', - 'sqlite': 'TIME', - }, - 'boolean': { - 'mysql': 'BOOLEAN', - 'sqlite': 'INTEGER', - } + 'string': 'TEXT', + 'floating': 'REAL', + 'integer': 'INTEGER', + 'datetime': 'TIMESTAMP', + 'date': 'DATE', + 'time': 'TIME', + 'boolean': 'INTEGER', } @@ -1349,22 +1224,6 @@ def _get_unicode_name(name): return uname -def _get_valid_mysql_name(name): - # Filter for unquoted identifiers - # See http://dev.mysql.com/doc/refman/5.0/en/identifiers.html - uname = _get_unicode_name(name) - if not len(uname): - raise ValueError("Empty table or column name specified") - - basere = r'[0-9,a-z,A-Z$_]' - for c in uname: - if not re.match(basere, c): - if not (0x80 < ord(c) < 0xFFFF): - raise ValueError("Invalid MySQL identifier '%s'" % uname) - - return '`' + uname + '`' - - def _get_valid_sqlite_name(name): # See http://stackoverflow.com/questions/6514274/how-do-you-escape-strings\ # -for-sqlite-table-column-names-in-python @@ -1383,19 +1242,6 @@ def _get_valid_sqlite_name(name): return '"' + uname.replace('"', '""') + '"' -# SQL enquote and wildcard symbols -_SQL_WILDCARD = { - 'mysql': '%s', - 'sqlite': '?' -} - -# Validate and return escaped identifier -_SQL_GET_IDENTIFIER = { - 'mysql': _get_valid_mysql_name, - 'sqlite': _get_valid_sqlite_name, -} - - _SAFE_NAMES_WARNING = ("The spaces in these column names will not be changed. " "In pandas versions < 0.14, spaces were converted to " "underscores.") @@ -1426,9 +1272,8 @@ def _execute_create(self): def insert_statement(self): names = list(map(text_type, self.frame.columns)) - flv = self.pd_sql.flavor - wld = _SQL_WILDCARD[flv] # wildcard char - escape = _SQL_GET_IDENTIFIER[flv] + wld = '?' 
# wildcard char + escape = _get_valid_sqlite_name if self.index is not None: [names.insert(0, idx) for idx in self.index[::-1]] @@ -1458,14 +1303,13 @@ def _create_table_setup(self): if any(map(pat.search, column_names)): warnings.warn(_SAFE_NAMES_WARNING, stacklevel=6) - flv = self.pd_sql.flavor - escape = _SQL_GET_IDENTIFIER[flv] + escape = _get_valid_sqlite_name create_tbl_stmts = [escape(cname) + ' ' + ctype for cname, ctype, _ in column_names_and_types] if self.keys is not None and len(self.keys): - if not com.is_list_like(self.keys): + if not is_list_like(self.keys): keys = [self.keys] else: keys = self.keys @@ -1512,7 +1356,7 @@ def _sql_type_name(self, col): if col_type not in _SQL_TYPES: col_type = "string" - return _SQL_TYPES[col_type][self.pd_sql.flavor] + return _SQL_TYPES[col_type] class SQLiteDatabase(PandasSQL): @@ -1520,25 +1364,17 @@ class SQLiteDatabase(PandasSQL): Version of SQLDatabase to support sqlite connections (fallback without sqlalchemy). This should only be used internally. - For now still supports `flavor` argument to deal with 'mysql' database - for backwards compatibility, but this will be removed in future versions. - Parameters ---------- con : sqlite connection object """ - def __init__(self, con, flavor, is_cursor=False): + def __init__(self, con, flavor=None, is_cursor=False): + _validate_flavor_parameter(flavor) + self.is_cursor = is_cursor self.con = con - if flavor is None: - flavor = 'sqlite' - if flavor not in ['sqlite', 'mysql']: - raise NotImplementedError("flavors other than SQLite and MySQL " - "are not supported") - else: - self.flavor = flavor @contextmanager def run_transaction(self): @@ -1644,11 +1480,15 @@ def to_sql(self, frame, name, if_exists='fail', index=True, chunksize : int, default None If not None, then rows will be written in batches of this size at a time. If None, all rows will be written at once. - dtype : dict of column name to SQL type, default None + dtype : single type or dict of column name to SQL type, default None Optional specifying the datatype for columns. The SQL type should - be a string. + be a string. If all columns are of the same type, one single value + can be used. """ + if dtype and not is_dict_like(dtype): + dtype = {col_name: dtype for col_name in frame} + if dtype is not None: for col, my_type in dtype.items(): if not isinstance(my_type, str): @@ -1663,15 +1503,12 @@ def to_sql(self, frame, name, if_exists='fail', index=True, def has_table(self, name, schema=None): # TODO(wesm): unused? - # escape = _SQL_GET_IDENTIFIER[self.flavor] + # escape = _get_valid_sqlite_name # esc_name = escape(name) - wld = _SQL_WILDCARD[self.flavor] - flavor_map = { - 'sqlite': ("SELECT name FROM sqlite_master " - "WHERE type='table' AND name=%s;") % wld, - 'mysql': "SHOW TABLES LIKE %s" % wld} - query = flavor_map.get(self.flavor) + wld = '?' 
+ query = ("SELECT name FROM sqlite_master " + "WHERE type='table' AND name=%s;") % wld return len(self.execute(query, [name, ]).fetchall()) > 0 @@ -1679,8 +1516,7 @@ def get_table(self, table_name, schema=None): return None # not supported in fallback mode def drop_table(self, name, schema=None): - escape = _SQL_GET_IDENTIFIER[self.flavor] - drop_sql = "DROP TABLE %s" % escape(name) + drop_sql = "DROP TABLE %s" % _get_valid_sqlite_name(name) self.execute(drop_sql) def _create_sql_schema(self, frame, table_name, keys=None, dtype=None): @@ -1689,7 +1525,7 @@ def _create_sql_schema(self, frame, table_name, keys=None, dtype=None): return str(table.sql_schema()) -def get_schema(frame, name, flavor='sqlite', keys=None, con=None, dtype=None): +def get_schema(frame, name, flavor=None, keys=None, con=None, dtype=None): """ Get the SQL db table schema for the given frame. @@ -1698,16 +1534,14 @@ def get_schema(frame, name, flavor='sqlite', keys=None, con=None, dtype=None): frame : DataFrame name : string name of SQL table - flavor : {'sqlite', 'mysql'}, default 'sqlite' - The flavor of SQL to use. Ignored when using SQLAlchemy connectable. - 'mysql' is deprecated and will be removed in future versions, but it - will be further supported through SQLAlchemy engines. keys : string or sequence, default: None columns to use a primary key con: an open SQL database connection object or a SQLAlchemy connectable Using SQLAlchemy makes it possible to use any DB supported by that library, default: None If a DBAPI2 object, only sqlite3 is supported. + flavor : 'sqlite', default None + DEPRECATED: this parameter will be removed in a future version dtype : dict of column name to SQL type, default None Optional specifying the datatype for columns. The SQL type should be a SQLAlchemy type, or a string for sqlite3 fallback connection. diff --git a/pandas/io/stata.py b/pandas/io/stata.py index ae7200cf6fb2e..25f13048a73fd 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -7,13 +7,17 @@ a once again improved version. 
You can find more information on http://presbrey.mit.edu/PyDTA and -http://statsmodels.sourceforge.net/devel/ +http://www.statsmodels.org/devel/ """ import numpy as np import sys import struct from dateutil.relativedelta import relativedelta + +from pandas.types.common import (is_categorical_dtype, is_datetime64_dtype, + _ensure_object) + from pandas.core.base import StringMixin from pandas.core.categorical import Categorical from pandas.core.frame import DataFrame @@ -24,7 +28,7 @@ zip, BytesIO from pandas.util.decorators import Appender import pandas as pd -import pandas.core.common as com + from pandas.io.common import get_filepath_or_buffer, BaseIterator from pandas.lib import max_len_string_array, infer_dtype from pandas.tslib import NaT, Timestamp @@ -163,15 +167,11 @@ def read_stata(filepath_or_buffer, convert_dates=True, chunksize=chunksize, encoding=encoding) if iterator or chunksize: - try: - return reader - except StopIteration: - reader.close() - - try: - return reader.read() - finally: + data = reader + else: + data = reader.read() reader.close() + return data _date_formats = ["%tc", "%tC", "%td", "%d", "%tw", "%tm", "%tq", "%th", "%ty"] @@ -358,7 +358,7 @@ def _datetime_to_stata_elapsed_vec(dates, fmt): def parse_dates_safe(dates, delta=False, year=False, days=False): d = {} - if com.is_datetime64_dtype(dates.values): + if is_datetime64_dtype(dates.values): if delta: delta = dates - stata_epoch d['delta'] = delta.values.astype( @@ -396,7 +396,7 @@ def parse_dates_safe(dates, delta=False, year=False, days=False): index = dates.index if bad_loc.any(): dates = Series(dates) - if com.is_datetime64_dtype(dates): + if is_datetime64_dtype(dates): dates[bad_loc] = to_datetime(stata_epoch) else: dates[bad_loc] = stata_epoch @@ -428,7 +428,7 @@ def parse_dates_safe(dates, delta=False, year=False, days=False): d = parse_dates_safe(dates, year=True) conv_dates = d.year else: - raise ValueError("fmt %s not understood" % fmt) + raise ValueError("Format %s is not a known Stata date format" % fmt) conv_dates = Series(conv_dates, dtype=np.float64) missing_value = struct.unpack('= 117: if self.path_or_buf.read(5) == b' - break # end of variable label table + break # end of value label table slength = self.path_or_buf.read(4) if not slength: - break # end of variable label table (format < 117) + break # end of value label table (format < 117) if self.format_version <= 117: labname = self._null_terminate(self.path_or_buf.read(33)) else: @@ -1407,13 +1407,13 @@ def read(self, nrows=None, convert_dates=None, convert_categoricals=None, index=None, convert_missing=None, preserve_dtypes=None, columns=None, order_categoricals=None): - # Handle empty file or chunk. If reading incrementally raise # StopIteration. If reading the whole thing return an empty # data frame. 
if (self.nobs == 0) and (nrows is None): self._can_read_value_labels = True self._data_read = True + self.close() return DataFrame(columns=self.varlist) # Handle options @@ -1459,6 +1459,7 @@ def read(self, nrows=None, convert_dates=None, # we are reading the file incrementally if convert_categoricals: self._read_value_labels() + self.close() raise StopIteration offset = self._lines_read * dtype.itemsize self.path_or_buf.seek(self.data_location + offset) @@ -1490,7 +1491,11 @@ def read(self, nrows=None, convert_dates=None, data = data.set_index(ix) if columns is not None: - data = self._do_select_columns(data, columns) + try: + data = self._do_select_columns(data, columns) + except ValueError: + self.close() + raise # Decode strings for col, typ in zip(data, self.typlist): @@ -1510,7 +1515,7 @@ def read(self, nrows=None, convert_dates=None, if self.dtyplist[i] is not None: col = data.columns[i] dtype = data[col].dtype - if (dtype != np.dtype(object)) and (dtype != self.dtyplist[i]): + if dtype != np.dtype(object) and dtype != self.dtyplist[i]: requires_type_conversion = True data_formatted.append( (col, Series(data[col], index, self.dtyplist[i]))) @@ -1527,9 +1532,13 @@ def read(self, nrows=None, convert_dates=None, self.fmtlist))[0] for i in cols: col = data.columns[i] - data[col] = _stata_elapsed_date_to_datetime_vec( - data[col], - self.fmtlist[i]) + try: + data[col] = _stata_elapsed_date_to_datetime_vec( + data[col], + self.fmtlist[i]) + except ValueError: + self.close() + raise if convert_categoricals and self.format_version > 108: data = self._do_convert_categoricals(data, @@ -1645,7 +1654,15 @@ def _do_convert_categoricals(self, data, value_label_dict, lbllist, categories.append(value_label_dict[label][category]) else: categories.append(category) # Partially labeled - cat_data.categories = categories + try: + cat_data.categories = categories + except ValueError: + vc = Series(categories).value_counts() + repeats = list(vc.index[vc > 1]) + repeats = '\n' + '-' * 80 + '\n'.join(repeats) + msg = 'Value labels for column {0} are not unique. The ' \ + 'repeated labels are:\n{1}'.format(col, repeats) + raise ValueError(msg) # TODO: is the next line needed above in the data(...) method? 
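The try/except added above exists because Categorical requires unique categories; a Stata file that maps two codes to the same value label would otherwise fail with an opaque message. The underlying constraint, illustrated directly on a Categorical (the labels are illustrative):

import pandas as pd

cat = pd.Categorical.from_codes([0, 1, 0], categories=['a', 'b'])
try:
    cat.categories = ['dup', 'dup']   # two codes pointing at one label
except ValueError as err:
    print(err)                        # categories must be unique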
cat_data = Series(cat_data, index=data.index) cat_converted_data.append((col, cat_data)) @@ -1662,7 +1679,7 @@ def variable_labels(self): """Returns variable labels as a dict, associating each variable name with corresponding label """ - return dict(zip(self.varlist, self.vlblist)) + return dict(zip(self.varlist, self._variable_labels)) def value_labels(self): """Returns a dict, associating each variable name a dict, associating @@ -1692,7 +1709,7 @@ def _set_endianness(endianness): def _pad_bytes(name, length): """ - Takes a char string and pads it wih null bytes until it's length chars + Takes a char string and pads it with null bytes until it's length chars """ return name + "\x00" * (length - len(name)) @@ -1705,7 +1722,7 @@ def _convert_datetime_to_stata_type(fmt): "%tq", "th", "%th", "ty", "%ty"]: return np.float64 # Stata expects doubles for SIFs else: - raise ValueError("fmt %s not understood" % fmt) + raise NotImplementedError("Format %s not implemented" % fmt) def _maybe_convert_to_int_keys(convert_dates, varlist): @@ -1717,9 +1734,8 @@ def _maybe_convert_to_int_keys(convert_dates, varlist): new_dict.update({varlist.index(key): convert_dates[key]}) else: if not isinstance(key, int): - raise ValueError( - "convert_dates key is not in varlist and is not an int" - ) + raise ValueError("convert_dates key must be a " + "column or an integer") new_dict.update({key: convert_dates[key]}) return new_dict @@ -1746,7 +1762,7 @@ def _dtype_to_stata_type(dtype, column): elif dtype.type == np.object_: # try to coerce it to the biggest string # not memory efficient, what else could we # do? - itemsize = max_len_string_array(com._ensure_object(column.values)) + itemsize = max_len_string_array(_ensure_object(column.values)) return chr(max(itemsize, 1)) elif dtype == np.float64: return chr(255) @@ -1759,8 +1775,7 @@ def _dtype_to_stata_type(dtype, column): elif dtype == np.int8: return chr(251) else: # pragma : no cover - raise ValueError("Data type %s not currently understood. " - "Please report an error to the developers." % dtype) + raise NotImplementedError("Data type %s not supported." % dtype) def _dtype_to_default_stata_fmt(dtype, column): @@ -1784,7 +1799,7 @@ def _dtype_to_default_stata_fmt(dtype, column): if not (inferred_dtype in ('string', 'unicode') or len(column) == 0): raise ValueError('Writing general object arrays is not supported') - itemsize = max_len_string_array(com._ensure_object(column.values)) + itemsize = max_len_string_array(_ensure_object(column.values)) if itemsize > 244: raise ValueError(excessive_string_length_error % column.name) return "%" + str(max(itemsize, 1)) + "s" @@ -1797,35 +1812,41 @@ def _dtype_to_default_stata_fmt(dtype, column): elif dtype == np.int8 or dtype == np.int16: return "%8.0g" else: # pragma : no cover - raise ValueError("Data type %s not currently understood. " - "Please report an error to the developers." % dtype) + raise NotImplementedError("Data type %s not supported." % dtype) class StataWriter(StataParser): """ - A class for writing Stata binary dta files from array-like objects + A class for writing Stata binary dta files Parameters ---------- - fname : file path or buffer - Where to save the dta file. - data : array-like - Array-like input to save. Pandas objects are also accepted. + fname : str or buffer + String path of file-like object + data : DataFrame + Input to save convert_dates : dict - Dictionary mapping column of datetime types to the stata internal - format that you want to use for the dates. 
Options are - 'tc', 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either a - number or a name. + Dictionary mapping columns containing datetime types to stata internal + format to use when writing the dates. Options are 'tc', 'td', 'tm', + 'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name. + Datetime columns that do not have a conversion type specified will be + converted to 'tc'. Raises NotImplementedError if a datetime column has + timezone information + write_index : bool + Write the index to Stata dataset. encoding : str - Default is latin-1. Note that Stata does not support unicode. + Default is latin-1. Unicode is not supported byteorder : str - Can be ">", "<", "little", or "big". The default is None which uses - `sys.byteorder` + Can be ">", "<", "little", or "big". default is `sys.byteorder` time_stamp : datetime - A date time to use when writing the file. Can be None, in which - case the current time is used. + A datetime to use as file creation date. Default is the current time dataset_label : str - A label for the data set. Should be 80 characters or smaller. + A label for the data set. Must be 80 characters or smaller. + variable_labels : dict + Dictionary containing columns as keys and variable labels as values. + Each label must be 80 characters or smaller. + + .. versionadded:: 0.19.0 Returns ------- @@ -1833,6 +1854,17 @@ class StataWriter(StataParser): The StataWriter instance has a write_file method, which will write the file to the given `fname`. + Raises + ------ + NotImplementedError + * If datetimes contain timezone information + ValueError + * Columns listed in convert_dates are not either datetime64[ns] + or datetime.datetime + * Column dtype is not representable in Stata + * Column listed in convert_dates is not in DataFrame + * Categorical label contains more than 32,000 characters + Examples -------- >>> import pandas as pd @@ -1849,21 +1881,20 @@ class StataWriter(StataParser): def __init__(self, fname, data, convert_dates=None, write_index=True, encoding="latin-1", byteorder=None, time_stamp=None, - data_label=None): + data_label=None, variable_labels=None): super(StataWriter, self).__init__(encoding) - self._convert_dates = convert_dates + self._convert_dates = {} if convert_dates is None else convert_dates self._write_index = write_index self._time_stamp = time_stamp self._data_label = data_label + self._variable_labels = variable_labels # attach nobs, nvars, data, varlist, typlist self._prepare_pandas(data) if byteorder is None: byteorder = sys.byteorder self._byteorder = _set_endianness(byteorder) - self._file = _open_file_binary_write( - fname, self._encoding or self._default_encoding - ) + self._fname = fname self.type_converters = {253: np.int32, 252: np.int16, 251: np.int8} def _write(self, to_write): @@ -1880,7 +1911,7 @@ def _prepare_categoricals(self, data): """Check for categorical columns, retain categorical information for Stata file and convert categorical data to int""" - is_cat = [com.is_categorical_dtype(data[col]) for col in data] + is_cat = [is_categorical_dtype(data[col]) for col in data] self._is_col_cat = is_cat self._value_labels = [] if not any(is_cat): @@ -2030,15 +2061,22 @@ def _prepare_pandas(self, data): self.varlist = data.columns.tolist() dtypes = data.dtypes - if self._convert_dates is not None: - self._convert_dates = _maybe_convert_to_int_keys( - self._convert_dates, self.varlist + + # Ensure all date columns are converted + for col in data: + if col in self._convert_dates: + continue + if 
is_datetime64_dtype(data[col]): + self._convert_dates[col] = 'tc' + + self._convert_dates = _maybe_convert_to_int_keys(self._convert_dates, + self.varlist) + for key in self._convert_dates: + new_type = _convert_datetime_to_stata_type( + self._convert_dates[key] ) - for key in self._convert_dates: - new_type = _convert_datetime_to_stata_type( - self._convert_dates[key] - ) - dtypes[key] = np.dtype(new_type) + dtypes[key] = np.dtype(new_type) + self.typlist = [] self.fmtlist = [] for col, dtype in dtypes.iteritems(): @@ -2051,16 +2089,21 @@ def _prepare_pandas(self, data): self.fmtlist[key] = self._convert_dates[key] def write_file(self): - self._write_header(time_stamp=self._time_stamp, - data_label=self._data_label) - self._write_descriptors() - self._write_variable_labels() - # write 5 zeros for expansion fields - self._write(_pad_bytes("", 5)) - self._prepare_data() - self._write_data() - self._write_value_labels() - self._file.close() + self._file = _open_file_binary_write( + self._fname, self._encoding or self._default_encoding + ) + try: + self._write_header(time_stamp=self._time_stamp, + data_label=self._data_label) + self._write_descriptors() + self._write_variable_labels() + # write 5 zeros for expansion fields + self._write(_pad_bytes("", 5)) + self._prepare_data() + self._write_data() + self._write_value_labels() + finally: + self._file.close() def _write_value_labels(self): for vl in self._value_labels: @@ -2131,11 +2174,29 @@ def _write_descriptors(self, typlist=None, varlist=None, srtlist=None, else: # Default is empty label self._write(_pad_bytes("", 33)) - def _write_variable_labels(self, labels=None): - nvar = self.nvar - if labels is None: - for i in range(nvar): - self._write(_pad_bytes("", 81)) + def _write_variable_labels(self): + # Missing labels are 80 blank characters plus null termination + blank = _pad_bytes('', 81) + + if self._variable_labels is None: + for i in range(self.nvar): + self._write(blank) + return + + for col in self.data: + if col in self._variable_labels: + label = self._variable_labels[col] + if len(label) > 80: + raise ValueError('Variable labels must be 80 characters ' + 'or fewer') + is_latin1 = all(ord(c) < 256 for c in label) + if not is_latin1: + raise ValueError('Variable labels must contain only ' + 'characters that can be encoded in ' + 'Latin-1') + self._write(_pad_bytes(label, 81)) + else: + self._write(blank) def _prepare_data(self): data = self.data diff --git a/pandas/tests/data/categorical_0_14_1.pickle b/pandas/io/tests/data/categorical_0_14_1.pickle similarity index 100% rename from pandas/tests/data/categorical_0_14_1.pickle rename to pandas/io/tests/data/categorical_0_14_1.pickle diff --git a/pandas/tests/data/categorical_0_15_2.pickle b/pandas/io/tests/data/categorical_0_15_2.pickle similarity index 100% rename from pandas/tests/data/categorical_0_15_2.pickle rename to pandas/io/tests/data/categorical_0_15_2.pickle diff --git a/pandas/io/tests/data/legacy_msgpack/0.18.1/0.18.1_x86_64_darwin_2.7.12.msgpack b/pandas/io/tests/data/legacy_msgpack/0.18.1/0.18.1_x86_64_darwin_2.7.12.msgpack new file mode 100644 index 0000000000000..978c2c5045314 Binary files /dev/null and b/pandas/io/tests/data/legacy_msgpack/0.18.1/0.18.1_x86_64_darwin_2.7.12.msgpack differ diff --git a/pandas/io/tests/data/legacy_msgpack/0.18.1/0.18.1_x86_64_darwin_3.5.2.msgpack b/pandas/io/tests/data/legacy_msgpack/0.18.1/0.18.1_x86_64_darwin_3.5.2.msgpack new file mode 100644 index 0000000000000..ea8efdc86dd2d Binary files /dev/null and 
b/pandas/io/tests/data/legacy_msgpack/0.18.1/0.18.1_x86_64_darwin_3.5.2.msgpack differ diff --git a/pandas/io/tests/data/legacy_pickle/0.14.1/0.14.1_x86_64_darwin_2.7.12.pickle b/pandas/io/tests/data/legacy_pickle/0.14.1/0.14.1_x86_64_darwin_2.7.12.pickle new file mode 100644 index 0000000000000..917ad2b0ff1a3 Binary files /dev/null and b/pandas/io/tests/data/legacy_pickle/0.14.1/0.14.1_x86_64_darwin_2.7.12.pickle differ diff --git a/pandas/io/tests/data/legacy_pickle/0.15.0/0.15.0_x86_64_darwin_2.7.12.pickle b/pandas/io/tests/data/legacy_pickle/0.15.0/0.15.0_x86_64_darwin_2.7.12.pickle new file mode 100644 index 0000000000000..c7a745cf9b458 Binary files /dev/null and b/pandas/io/tests/data/legacy_pickle/0.15.0/0.15.0_x86_64_darwin_2.7.12.pickle differ diff --git a/pandas/io/tests/data/legacy_pickle/0.18.1/0.18.1_x86_64_darwin_2.7.12.pickle b/pandas/io/tests/data/legacy_pickle/0.18.1/0.18.1_x86_64_darwin_2.7.12.pickle new file mode 100644 index 0000000000000..bb237f53476b5 Binary files /dev/null and b/pandas/io/tests/data/legacy_pickle/0.18.1/0.18.1_x86_64_darwin_2.7.12.pickle differ diff --git a/pandas/io/tests/data/legacy_pickle/0.18.1/0.18.1_x86_64_darwin_3.5.2.pickle b/pandas/io/tests/data/legacy_pickle/0.18.1/0.18.1_x86_64_darwin_3.5.2.pickle new file mode 100644 index 0000000000000..db1d17a8b67c3 Binary files /dev/null and b/pandas/io/tests/data/legacy_pickle/0.18.1/0.18.1_x86_64_darwin_3.5.2.pickle differ diff --git a/pandas/io/tests/data/stata15.dta b/pandas/io/tests/data/stata15.dta new file mode 100644 index 0000000000000..d13e2fa337db3 Binary files /dev/null and b/pandas/io/tests/data/stata15.dta differ diff --git a/pandas/io/tests/data/test5.xls b/pandas/io/tests/data/test5.xls new file mode 100644 index 0000000000000..4bb7cd4767dd7 Binary files /dev/null and b/pandas/io/tests/data/test5.xls differ diff --git a/pandas/io/tests/data/test5.xlsm b/pandas/io/tests/data/test5.xlsm new file mode 100644 index 0000000000000..845cec785b498 Binary files /dev/null and b/pandas/io/tests/data/test5.xlsm differ diff --git a/pandas/io/tests/data/test5.xlsx b/pandas/io/tests/data/test5.xlsx new file mode 100644 index 0000000000000..13781bb06048f Binary files /dev/null and b/pandas/io/tests/data/test5.xlsx differ diff --git a/pandas/io/tests/data/test_mmap.csv b/pandas/io/tests/data/test_mmap.csv new file mode 100644 index 0000000000000..cc2cd7c30349b --- /dev/null +++ b/pandas/io/tests/data/test_mmap.csv @@ -0,0 +1,5 @@ +a,b,c +1,one,I +2,two,II + +3,three,III diff --git a/pandas/io/tests/data/yahoo_options1.html b/pandas/io/tests/data/yahoo_options1.html deleted file mode 100644 index 2846a2bd12732..0000000000000 --- a/pandas/io/tests/data/yahoo_options1.html +++ /dev/null @@ -1,6065 +0,0 @@ - - - - - AAPL Options | Yahoo! Inc. Stock - Yahoo! Finance - - - - - - - - - - - - - - - - - - - - - - - - - -
[~6,065 lines of deleted Yahoo Finance "AAPL Options" page markup (test fixture) omitted]
\ No newline at end of file diff --git a/pandas/io/tests/data/yahoo_options2.html b/pandas/io/tests/data/yahoo_options2.html deleted file mode 100644 index bae9c193e03e1..0000000000000 --- a/pandas/io/tests/data/yahoo_options2.html +++ /dev/null @@ -1,5853 +0,0 @@ [~5,853 lines of deleted Yahoo Finance "AAPL Option Chain" page markup (test fixture) omitted]
\ No newline at end of file diff --git a/pandas/io/tests/data/yahoo_options3.html b/pandas/io/tests/data/yahoo_options3.html deleted file mode 100644 index 6e79bb9bf9f36..0000000000000 --- a/pandas/io/tests/data/yahoo_options3.html +++ /dev/null @@ -1,2807 +0,0 @@ [~2,807 lines of deleted Yahoo Finance "SPWR Option Chain" page markup (test fixture) omitted]
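Editor's note (not part of the patch): the json test hunks below (test_read_jsonl / test_to_jsonl, GH9180) exercise the new lines=True support for line-delimited JSON. A minimal, hypothetical round-trip sketch with made-up data:

import pandas as pd

df = pd.DataFrame([[1, 2], [3, 4]], columns=['a', 'b'])
jsonl = df.to_json(orient='records', lines=True)
print(jsonl)  # {"a":1,"b":2}\n{"a":3,"b":4}
roundtrip = pd.read_json(jsonl, lines=True)
# newer pandas versions may prefer wrapping the string in StringIO
assert roundtrip.equals(df)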
    - - - - - - - - - - - - \ No newline at end of file diff --git a/pandas/io/tests/generate_legacy_storage_files.py b/pandas/io/tests/generate_legacy_storage_files.py index bfa8ff6d30a9c..d0365cb2c30b3 100644 --- a/pandas/io/tests/generate_legacy_storage_files.py +++ b/pandas/io/tests/generate_legacy_storage_files.py @@ -5,7 +5,7 @@ SparseSeries, SparseDataFrame, Index, MultiIndex, bdate_range, to_msgpack, date_range, period_range, - Timestamp, Categorical, Period) + Timestamp, NaT, Categorical, Period) from pandas.compat import u import os import sys @@ -80,6 +80,7 @@ def create_data(): [u'one', u'two', u'one', u'two', u'one', u'two', u'one', u'two']])), names=[u'first', u'second'])) + series = dict(float=Series(data[u'A']), int=Series(data[u'B']), mixed=Series(data[u'E']), @@ -135,6 +136,17 @@ def create_data(): items=[u'A', u'B', u'A']), mixed_dup=mixed_dup_panel) + cat = dict(int8=Categorical(list('abcdefg')), + int16=Categorical(np.arange(1000)), + int32=Categorical(np.arange(10000))) + + timestamp = dict(normal=Timestamp('2011-01-01'), + nat=NaT, + tz=Timestamp('2011-01-01', tz='US/Eastern'), + freq=Timestamp('2011-01-01', freq='D'), + both=Timestamp('2011-01-01', tz='Asia/Tokyo', + freq='M')) + return dict(series=series, frame=frame, panel=panel, @@ -143,7 +155,9 @@ def create_data(): mi=mi, sp_series=dict(float=_create_sp_series(), ts=_create_sp_tsseries()), - sp_frame=dict(float=_create_sp_frame())) + sp_frame=dict(float=_create_sp_frame()), + cat=cat, + timestamp=timestamp) def create_pickle_data(): diff --git a/pandas/io/tests/json/test_pandas.py b/pandas/io/tests/json/test_pandas.py index 9f8aedc2e399e..47bdd25572fc7 100644 --- a/pandas/io/tests/json/test_pandas.py +++ b/pandas/io/tests/json/test_pandas.py @@ -1,4 +1,5 @@ # pylint: disable-msg=W0612,E1101 +import nose from pandas.compat import range, lrange, StringIO, OrderedDict import os @@ -907,17 +908,17 @@ def test_tz_is_utc(self): ts = Timestamp('2013-01-10 05:00:00Z') self.assertEqual(exp, pd.json.dumps(ts, iso_dates=True)) - dt = ts.to_datetime() + dt = ts.to_pydatetime() self.assertEqual(exp, pd.json.dumps(dt, iso_dates=True)) ts = Timestamp('2013-01-10 00:00:00', tz='US/Eastern') self.assertEqual(exp, pd.json.dumps(ts, iso_dates=True)) - dt = ts.to_datetime() + dt = ts.to_pydatetime() self.assertEqual(exp, pd.json.dumps(dt, iso_dates=True)) ts = Timestamp('2013-01-10 00:00:00-0500') self.assertEqual(exp, pd.json.dumps(ts, iso_dates=True)) - dt = ts.to_datetime() + dt = ts.to_pydatetime() self.assertEqual(exp, pd.json.dumps(dt, iso_dates=True)) def test_tz_range_is_utc(self): @@ -948,8 +949,63 @@ def test_tz_range_is_utc(self): df = DataFrame({'DT': dti}) self.assertEqual(dfexp, pd.json.dumps(df, iso_dates=True)) + def test_read_jsonl(self): + # GH9180 + result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True) + expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) + assert_frame_equal(result, expected) + + def test_to_jsonl(self): + # GH9180 + df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) + result = df.to_json(orient="records", lines=True) + expected = '{"a":1,"b":2}\n{"a":1,"b":2}' + self.assertEqual(result, expected) + + def test_latin_encoding(self): + if compat.PY2: + self.assertRaisesRegexp( + TypeError, '\[unicode\] is not implemented as a table column') + return + + # GH 13774 + raise nose.SkipTest("encoding not implemented in .to_json(), " + "xref #13774") + + values = [[b'E\xc9, 17', b'', b'a', b'b', b'c'], + [b'E\xc9, 17', b'a', b'b', b'c'], + [b'EE, 17', b'', b'a', b'b', b'c'], + 
[b'E\xc9, 17', b'\xf8\xfc', b'a', b'b', b'c'], + [b'', b'a', b'b', b'c'], + [b'\xf8\xfc', b'a', b'b', b'c'], + [b'A\xf8\xfc', b'', b'a', b'b', b'c'], + [np.nan, b'', b'b', b'c'], + [b'A\xf8\xfc', np.nan, b'', b'b', b'c']] + + def _try_decode(x, encoding='latin-1'): + try: + return x.decode(encoding) + except AttributeError: + return x + + # not sure how to remove latin-1 from code in python 2 and 3 + values = [[_try_decode(x) for x in y] for y in values] + + examples = [] + for dtype in ['category', object]: + for val in values: + examples.append(Series(val, dtype=dtype)) + + def roundtrip(s, encoding='latin-1'): + with ensure_clean('test.json') as path: + s.to_json(path, encoding=encoding) + retr = read_json(path, encoding=encoding) + assert_series_equal(s, retr, check_categorical=False) + + for s in examples: + roundtrip(s) + if __name__ == '__main__': - import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure', '-s'], exit=False) diff --git a/pandas/io/tests/json/test_ujson.py b/pandas/io/tests/json/test_ujson.py index 13b2dafec9c89..704023bd847b7 100644 --- a/pandas/io/tests/json/test_ujson.py +++ b/pandas/io/tests/json/test_ujson.py @@ -1306,43 +1306,47 @@ def testSeries(self): # column indexed outp = Series(ujson.decode(ujson.encode(s))).sort_values() - self.assertTrue((s == outp).values.all()) + exp = Series([10, 20, 30, 40, 50, 60], + index=['6', '7', '8', '9', '10', '15']) + tm.assert_series_equal(outp, exp) outp = Series(ujson.decode(ujson.encode(s), numpy=True)).sort_values() - self.assertTrue((s == outp).values.all()) + tm.assert_series_equal(outp, exp) dec = _clean_dict(ujson.decode(ujson.encode(s, orient="split"))) outp = Series(**dec) - self.assertTrue((s == outp).values.all()) - self.assertTrue(s.name == outp.name) + tm.assert_series_equal(outp, s) dec = _clean_dict(ujson.decode(ujson.encode(s, orient="split"), numpy=True)) outp = Series(**dec) - self.assertTrue((s == outp).values.all()) - self.assertTrue(s.name == outp.name) - outp = Series(ujson.decode(ujson.encode( - s, orient="records"), numpy=True)) - self.assertTrue((s == outp).values.all()) + exp_np = Series(np.array([10, 20, 30, 40, 50, 60])) + exp_pd = Series([10, 20, 30, 40, 50, 60]) + outp = Series(ujson.decode(ujson.encode(s, orient="records"), + numpy=True)) + tm.assert_series_equal(outp, exp_np) outp = Series(ujson.decode(ujson.encode(s, orient="records"))) - self.assertTrue((s == outp).values.all()) + exp = Series([10, 20, 30, 40, 50, 60]) + tm.assert_series_equal(outp, exp_pd) - outp = Series(ujson.decode( - ujson.encode(s, orient="values"), numpy=True)) - self.assertTrue((s == outp).values.all()) + outp = Series(ujson.decode(ujson.encode(s, orient="values"), + numpy=True)) + tm.assert_series_equal(outp, exp_np) outp = Series(ujson.decode(ujson.encode(s, orient="values"))) - self.assertTrue((s == outp).values.all()) + tm.assert_series_equal(outp, exp_pd) outp = Series(ujson.decode(ujson.encode( s, orient="index"))).sort_values() - self.assertTrue((s == outp).values.all()) + exp = Series([10, 20, 30, 40, 50, 60], + index=['6', '7', '8', '9', '10', '15']) + tm.assert_series_equal(outp, exp) outp = Series(ujson.decode(ujson.encode( s, orient="index"), numpy=True)).sort_values() - self.assertTrue((s == outp).values.all()) + tm.assert_series_equal(outp, exp) def testSeriesNested(self): s = Series([10, 20, 30, 40, 50, 60], name="series", diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py index 7fca37cef473e..09d521e5a7e46 100644 --- 
a/pandas/io/tests/parser/c_parser_only.py +++ b/pandas/io/tests/parser/c_parser_only.py @@ -12,9 +12,10 @@ import pandas as pd import pandas.util.testing as tm -from pandas import DataFrame, Series, Index, MultiIndex +from pandas import DataFrame, Series, Index, MultiIndex, Categorical from pandas import compat from pandas.compat import StringIO, range, lrange +from pandas.types.dtypes import CategoricalDtype class CParserTests(object): @@ -135,6 +136,11 @@ def test_passing_dtype(self): dtype={'A': 'timedelta64', 'B': 'float64'}, index_col=0) + # valid but unsupported - fixed width unicode string + self.assertRaises(TypeError, self.read_csv, path, + dtype={'A': 'U8'}, + index_col=0) + # see gh-12048: empty frame actual = self.read_csv(StringIO('A,B'), dtype=str) expected = DataFrame({'A': [], 'B': []}, index=[], dtype=str) @@ -172,46 +178,6 @@ def error(val): self.assertTrue(sum(precise_errors) <= sum(normal_errors)) self.assertTrue(max(precise_errors) <= max(normal_errors)) - def test_compact_ints(self): - if compat.is_platform_windows() and not self.low_memory: - raise nose.SkipTest( - "segfaults on win-64, only when all tests are run") - - data = ('0,1,0,0\n' - '1,1,0,0\n' - '0,1,0,1') - - result = self.read_csv(StringIO(data), delimiter=',', header=None, - compact_ints=True, as_recarray=True) - ex_dtype = np.dtype([(str(i), 'i1') for i in range(4)]) - self.assertEqual(result.dtype, ex_dtype) - - result = self.read_csv(StringIO(data), delimiter=',', header=None, - as_recarray=True, compact_ints=True, - use_unsigned=True) - ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)]) - self.assertEqual(result.dtype, ex_dtype) - - def test_compact_ints_as_recarray(self): - if compat.is_platform_windows() and self.low_memory: - raise nose.SkipTest( - "segfaults on win-64, only when all tests are run") - - data = ('0,1,0,0\n' - '1,1,0,0\n' - '0,1,0,1') - - result = self.read_csv(StringIO(data), delimiter=',', header=None, - compact_ints=True, as_recarray=True) - ex_dtype = np.dtype([(str(i), 'i1') for i in range(4)]) - self.assertEqual(result.dtype, ex_dtype) - - result = self.read_csv(StringIO(data), delimiter=',', header=None, - as_recarray=True, compact_ints=True, - use_unsigned=True) - ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)]) - self.assertEqual(result.dtype, ex_dtype) - def test_pass_dtype(self): data = """\ one,two @@ -224,6 +190,92 @@ def test_pass_dtype(self): self.assertEqual(result['one'].dtype, 'u1') self.assertEqual(result['two'].dtype, 'object') + def test_categorical_dtype(self): + # GH 10153 + data = """a,b,c +1,a,3.4 +1,a,3.4 +2,b,4.5""" + expected = pd.DataFrame({'a': Categorical(['1', '1', '2']), + 'b': Categorical(['a', 'a', 'b']), + 'c': Categorical(['3.4', '3.4', '4.5'])}) + actual = self.read_csv(StringIO(data), dtype='category') + tm.assert_frame_equal(actual, expected) + + actual = self.read_csv(StringIO(data), dtype=CategoricalDtype()) + tm.assert_frame_equal(actual, expected) + + actual = self.read_csv(StringIO(data), dtype={'a': 'category', + 'b': 'category', + 'c': CategoricalDtype()}) + tm.assert_frame_equal(actual, expected) + + actual = self.read_csv(StringIO(data), dtype={'b': 'category'}) + expected = pd.DataFrame({'a': [1, 1, 2], + 'b': Categorical(['a', 'a', 'b']), + 'c': [3.4, 3.4, 4.5]}) + tm.assert_frame_equal(actual, expected) + + actual = self.read_csv(StringIO(data), dtype={1: 'category'}) + tm.assert_frame_equal(actual, expected) + + # unsorted + data = """a,b,c +1,b,3.4 +1,b,3.4 +2,a,4.5""" + expected = pd.DataFrame({'a': Categorical(['1', 
'1', '2']), + 'b': Categorical(['b', 'b', 'a']), + 'c': Categorical(['3.4', '3.4', '4.5'])}) + actual = self.read_csv(StringIO(data), dtype='category') + tm.assert_frame_equal(actual, expected) + + # missing + data = """a,b,c +1,b,3.4 +1,nan,3.4 +2,a,4.5""" + expected = pd.DataFrame({'a': Categorical(['1', '1', '2']), + 'b': Categorical(['b', np.nan, 'a']), + 'c': Categorical(['3.4', '3.4', '4.5'])}) + actual = self.read_csv(StringIO(data), dtype='category') + tm.assert_frame_equal(actual, expected) + + def test_categorical_dtype_encoding(self): + # GH 10153 + pth = tm.get_data_path('unicode_series.csv') + encoding = 'latin-1' + expected = self.read_csv(pth, header=None, encoding=encoding) + expected[1] = Categorical(expected[1]) + actual = self.read_csv(pth, header=None, encoding=encoding, + dtype={1: 'category'}) + tm.assert_frame_equal(actual, expected) + + pth = tm.get_data_path('utf16_ex.txt') + encoding = 'utf-16' + expected = self.read_table(pth, encoding=encoding) + expected = expected.apply(Categorical) + actual = self.read_table(pth, encoding=encoding, dtype='category') + tm.assert_frame_equal(actual, expected) + + def test_categorical_dtype_chunksize(self): + # GH 10153 + data = """a,b +1,a +1,b +1,b +2,c""" + expecteds = [pd.DataFrame({'a': [1, 1], + 'b': Categorical(['a', 'b'])}), + pd.DataFrame({'a': [1, 2], + 'b': Categorical(['b', 'c'])}, + index=[2, 3])] + actuals = self.read_csv(StringIO(data), dtype={'b': 'category'}, + chunksize=2) + + for actual, expected in zip(actuals, expecteds): + tm.assert_frame_equal(actual, expected) + def test_pass_dtype_as_recarray(self): if compat.is_platform_windows() and self.low_memory: raise nose.SkipTest( @@ -236,10 +288,12 @@ def test_pass_dtype_as_recarray(self): 3,4.5 4,5.5""" - result = self.read_csv(StringIO(data), dtype={'one': 'u1', 1: 'S1'}, - as_recarray=True) - self.assertEqual(result['one'].dtype, 'u1') - self.assertEqual(result['two'].dtype, 'S1') + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False): + result = self.read_csv(StringIO(data), dtype={ + 'one': 'u1', 1: 'S1'}, as_recarray=True) + self.assertEqual(result['one'].dtype, 'u1') + self.assertEqual(result['two'].dtype, 'S1') def test_empty_pass_dtype(self): data = 'one,two' @@ -323,10 +377,6 @@ def test_usecols_dtypes(self): self.assertTrue((result.dtypes == [object, np.int, np.float]).all()) self.assertTrue((result2.dtypes == [object, np.float]).all()) - def test_memory_map(self): - # it works! - self.read_csv(self.csv1, memory_map=True) - def test_disable_bool_parsing(self): # #2090 @@ -423,3 +473,91 @@ def test_empty_header_read(count): for count in range(1, 101): test_empty_header_read(count) + + def test_parse_trim_buffers(self): + # This test is part of a bugfix for issue #13703. It attmepts to + # to stress the system memory allocator, to cause it to move the + # stream buffer and either let the OS reclaim the region, or let + # other memory requests of parser otherwise modify the contents + # of memory space, where it was formely located. + # This test is designed to cause a `segfault` with unpatched + # `tokenizer.c`. Sometimes the test fails on `segfault`, other + # times it fails due to memory corruption, which causes the + # loaded DataFrame to differ from the expected one. + + # Generate a large mixed-type CSV file on-the-fly (one record is + # approx 1.5KiB). 
+ record_ = \ + """9999-9,99:99,,,,ZZ,ZZ,,,ZZZ-ZZZZ,.Z-ZZZZ,-9.99,,,9.99,Z""" \ + """ZZZZ,,-99,9,ZZZ-ZZZZ,ZZ-ZZZZ,,9.99,ZZZ-ZZZZZ,ZZZ-ZZZZZ,""" \ + """ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,9""" \ + """99,ZZZ-ZZZZ,,ZZ-ZZZZ,,,,,ZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,,,9,9,""" \ + """9,9,99,99,999,999,ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,9,ZZ-ZZZZ,9.""" \ + """99,ZZ-ZZZZ,ZZ-ZZZZ,,,,ZZZZ,,,ZZ,ZZ,,,,,,,,,,,,,9,,,999.""" \ + """99,999.99,,,ZZZZZ,,,Z9,,,,,,,ZZZ,ZZZ,,,,,,,,,,,ZZZZZ,ZZ""" \ + """ZZZ,ZZZ-ZZZZZZ,ZZZ-ZZZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZ""" \ + """ZZ,,,999999,999999,ZZZ,ZZZ,,,ZZZ,ZZZ,999.99,999.99,,,,Z""" \ + """ZZ-ZZZ,ZZZ-ZZZ,-9.99,-9.99,9,9,,99,,9.99,9.99,9,9,9.99,""" \ + """9.99,,,,9.99,9.99,,99,,99,9.99,9.99,,,ZZZ,ZZZ,,999.99,,""" \ + """999.99,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,ZZZZZ,ZZZZZ,ZZZ,ZZZ,9,9,""" \ + """,,,,,ZZZ-ZZZZ,ZZZ999Z,,,999.99,,999.99,ZZZ-ZZZZ,,,9.999""" \ + """,9.999,9.999,9.999,-9.999,-9.999,-9.999,-9.999,9.999,9.""" \ + """999,9.999,9.999,9.999,9.999,9.999,9.999,99999,ZZZ-ZZZZ,""" \ + """,9.99,ZZZ,,,,,,,,ZZZ,,,,,9,,,,9,,,,,,,,,,ZZZ-ZZZZ,ZZZ-Z""" \ + """ZZZ,,ZZZZZ,ZZZZZ,ZZZZZ,ZZZZZ,,,9.99,,ZZ-ZZZZ,ZZ-ZZZZ,ZZ""" \ + """,999,,,,ZZ-ZZZZ,ZZZ,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,99.99,99.99""" \ + """,,,9.99,9.99,9.99,9.99,ZZZ-ZZZZ,,,ZZZ-ZZZZZ,,,,,-9.99,-""" \ + """9.99,-9.99,-9.99,,,,,,,,,ZZZ-ZZZZ,,9,9.99,9.99,99ZZ,,-9""" \ + """.99,-9.99,ZZZ-ZZZZ,,,,,,,ZZZ-ZZZZ,9.99,9.99,9999,,,,,,,""" \ + """,,,-9.9,Z/Z-ZZZZ,999.99,9.99,,999.99,ZZ-ZZZZ,ZZ-ZZZZ,9.""" \ + """99,9.99,9.99,9.99,9.99,9.99,,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZ""" \ + """ZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ,ZZZ,ZZZ,ZZZ,9.99,,,-9.99,ZZ""" \ + """-ZZZZ,-999.99,,-9999,,999.99,,,,999.99,99.99,,,ZZ-ZZZZZ""" \ + """ZZZ,ZZ-ZZZZ-ZZZZZZZ,,,,ZZ-ZZ-ZZZZZZZZ,ZZZZZZZZ,ZZZ-ZZZZ""" \ + """,9999,999.99,ZZZ-ZZZZ,-9.99,-9.99,ZZZ-ZZZZ,99:99:99,,99""" \ + """,99,,9.99,,-99.99,,,,,,9.99,ZZZ-ZZZZ,-9.99,-9.99,9.99,9""" \ + """.99,,ZZZ,,,,,,,ZZZ,ZZZ,,,,,""" + + # Set the number of lines so that a call to `parser_trim_buffers` + # is triggered: after a couple of full chunks are consumed a + # relatively small 'residual' chunk would cause reallocation + # within the parser. + chunksize, n_lines = 128, 2 * 128 + 15 + csv_data = "\n".join([record_] * n_lines) + "\n" + + # We will use StringIO to load the CSV from this text buffer. + # pd.read_csv() will iterate over the file in chunks and will + # finally read a residual chunk of really small size. + + # Generate the expected output: manually create the dataframe + # by splitting by comma and repeating the `n_lines` times. + row = tuple(val_ if val_ else float("nan") + for val_ in record_.split(",")) + expected = pd.DataFrame([row for _ in range(n_lines)], + dtype=object, columns=None, index=None) + + # Iterate over the CSV file in chunks of `chunksize` lines + chunks_ = self.read_csv(StringIO(csv_data), header=None, + dtype=object, chunksize=chunksize) + result = pd.concat(chunks_, axis=0, ignore_index=True) + + # Check for data corruption if there was no segfault + tm.assert_frame_equal(result, expected) + + def test_internal_null_byte(self): + # see gh-14012 + # + # The null byte ('\x00') should not be used as a + # true line terminator, escape character, or comment + # character, only as a placeholder to indicate that + # none was specified. + # + # This test should be moved to common.py ONLY when + # Python's csv class supports parsing '\x00'. 
+ names = ['a', 'b', 'c'] + data = "1,2,3\n4,\x00,6\n7,8,9" + expected = pd.DataFrame([[1, 2.0, 3], [4, np.nan, 6], + [7, 8, 9]], columns=names) + + result = self.read_csv(StringIO(data), names=names) + tm.assert_frame_equal(result, expected) diff --git a/pandas/io/tests/parser/comment.py b/pandas/io/tests/parser/comment.py index f7cd1e190ec16..9987a017cf985 100644 --- a/pandas/io/tests/parser/comment.py +++ b/pandas/io/tests/parser/comment.py @@ -104,3 +104,15 @@ def test_custom_comment_char(self): result = self.read_csv(StringIO(data), comment='#') expected = DataFrame({'a': [1, 4], 'b': [2, 5], 'c': [3, 6]}) tm.assert_frame_equal(result, expected) + + def test_commment_first_line(self): + # see gh-4623 + data = '# notes\na,b,c\n# more notes\n1,2,3' + + expected = DataFrame([[1, 2, 3]], columns=['a', 'b', 'c']) + result = self.read_csv(StringIO(data), comment='#') + tm.assert_frame_equal(result, expected) + + expected = DataFrame({0: ['a', '1'], 1: ['b', '2'], 2: ['c', '3']}) + result = self.read_csv(StringIO(data), comment='#', header=None) + tm.assert_frame_equal(result, expected) diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 44892dc17c47b..0b59b695e1dca 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -3,6 +3,7 @@ import csv import os import platform +import codecs import re import sys @@ -45,6 +46,27 @@ def test_empty_decimal_marker(self): with tm.assertRaisesRegexp(ValueError, msg): self.read_csv(StringIO(data), decimal='') + def test_bad_stream_exception(self): + # Issue 13652: + # This test validates that both python engine + # and C engine will raise UnicodeDecodeError instead of + # c engine raising CParserError and swallowing exception + # that caused read to fail. + handle = open(self.csv_shiftjs, "rb") + codec = codecs.lookup("utf-8") + utf8 = codecs.lookup('utf-8') + # stream must be binary UTF8 + stream = codecs.StreamRecoder( + handle, utf8.encode, utf8.decode, codec.streamreader, + codec.streamwriter) + if compat.PY3: + msg = "'utf-8' codec can't decode byte" + else: + msg = "'utf8' codec can't decode byte" + with tm.assertRaisesRegexp(UnicodeDecodeError, msg): + self.read_csv(stream) + stream.close() + def test_read_csv(self): if not compat.PY3: if compat.is_platform_windows(): @@ -127,11 +149,6 @@ def test_squeeze_no_view(self): result = self.read_csv(StringIO(data), index_col='time', squeeze=True) self.assertFalse(result._is_view) - def test_multiple_skts_example(self): - # TODO: Complete this - data = "year, month, a, b\n 2001, 01, 0.0, 10.\n 2001, 02, 1.1, 11." 
# noqa - pass - def test_malformed(self): # see gh-6607 @@ -196,9 +213,9 @@ def test_malformed(self): skiprows=[2]) it.read() - # skip_footer is not supported with the C parser yet + # skipfooter is not supported with the C parser yet if self.engine == 'python': - # skip_footer + # skipfooter data = """ignore A,B,C 1,2,3 # comment @@ -210,7 +227,7 @@ def test_malformed(self): with tm.assertRaisesRegexp(Exception, msg): self.read_table(StringIO(data), sep=',', header=1, comment='#', - skip_footer=1) + skipfooter=1) def test_quoting(self): bad_line_small = """printer\tresult\tvariant_name @@ -268,8 +285,11 @@ def test_csv_mixed_type(self): b,3,4 c,4,5 """ - # TODO: complete this - df = self.read_csv(StringIO(data)) # noqa + expected = DataFrame({'A': ['a', 'b', 'c'], + 'B': [1, 3, 4], + 'C': [2, 4, 5]}) + out = self.read_csv(StringIO(data)) + tm.assert_frame_equal(out, expected) def test_read_csv_dataframe(self): df = self.read_csv(self.csv1, index_col=0, parse_dates=True) @@ -439,6 +459,18 @@ def test_get_chunk_passed_chunksize(self): piece = result.get_chunk() self.assertEqual(len(piece), 2) + def test_read_chunksize_generated_index(self): + # GH 12185 + reader = self.read_csv(StringIO(self.data1), chunksize=2) + df = self.read_csv(StringIO(self.data1)) + + tm.assert_frame_equal(pd.concat(reader), df) + + reader = self.read_csv(StringIO(self.data1), chunksize=2, index_col=0) + df = self.read_csv(StringIO(self.data1), index_col=0) + + tm.assert_frame_equal(pd.concat(reader), df) + def test_read_text_list(self): data = """A,B,C\nfoo,1,2,3\nbar,4,5,6""" as_list = [['A', 'B', 'C'], ['foo', '1', '2', '3'], ['bar', @@ -502,11 +534,11 @@ def test_iterator(self): self.assertEqual(len(result), 3) tm.assert_frame_equal(pd.concat(result), expected) - # skip_footer is not supported with the C parser yet + # skipfooter is not supported with the C parser yet if self.engine == 'python': - # test bad parameter (skip_footer) + # test bad parameter (skipfooter) reader = self.read_csv(StringIO(self.data1), index_col=0, - iterator=True, skip_footer=True) + iterator=True, skipfooter=True) self.assertRaises(ValueError, reader.read, 3) def test_pass_names_with_index(self): @@ -608,10 +640,6 @@ def test_url(self): @tm.slow def test_file(self): - - # FILE - if sys.version_info[:2] < (2, 6): - raise nose.SkipTest("file:// not supported with Python < 2.6") dirpath = tm.get_data_path() localtable = os.path.join(dirpath, 'salary.table.csv') local_table = self.read_table(localtable) @@ -626,9 +654,10 @@ def test_file(self): tm.assert_frame_equal(url_table, local_table) def test_nonexistent_path(self): - # don't segfault pls #2428 + # gh-2428: pls no segfault + # gh-14086: raise more helpful FileNotFoundError path = '%s.csv' % tm.rands(10) - self.assertRaises(IOError, self.read_csv, path) + self.assertRaises(compat.FileNotFoundError, self.read_csv, path) def test_missing_trailing_delimiters(self): data = """A,B,C,D @@ -925,8 +954,8 @@ def test_empty_with_nrows_chunksize(self): StringIO('foo,bar\n'), chunksize=10))) tm.assert_frame_equal(result, expected) - # 'as_recarray' is not supported yet for the Python parser - if self.engine == 'c': + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False): result = self.read_csv(StringIO('foo,bar\n'), nrows=10, as_recarray=True) result = DataFrame(result[2], columns=result[1], @@ -934,11 +963,13 @@ def test_empty_with_nrows_chunksize(self): tm.assert_frame_equal(DataFrame.from_records( result), expected, check_index_type=False) - result = 
next(iter(self.read_csv( - StringIO('foo,bar\n'), chunksize=10, as_recarray=True))) + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False): + result = next(iter(self.read_csv(StringIO('foo,bar\n'), + chunksize=10, as_recarray=True))) result = DataFrame(result[2], columns=result[1], index=result[0]) - tm.assert_frame_equal(DataFrame.from_records( - result), expected, check_index_type=False) + tm.assert_frame_equal(DataFrame.from_records(result), expected, + check_index_type=False) def test_eof_states(self): # see gh-10728, gh-10548 @@ -1330,3 +1361,244 @@ def test_raise_on_no_columns(self): # test with more than a single newline data = "\n\n\n" self.assertRaises(EmptyDataError, self.read_csv, StringIO(data)) + + def test_compact_ints_use_unsigned(self): + # see gh-13323 + data = 'a,b,c\n1,9,258' + + # sanity check + expected = DataFrame({ + 'a': np.array([1], dtype=np.int64), + 'b': np.array([9], dtype=np.int64), + 'c': np.array([258], dtype=np.int64), + }) + out = self.read_csv(StringIO(data)) + tm.assert_frame_equal(out, expected) + + expected = DataFrame({ + 'a': np.array([1], dtype=np.int8), + 'b': np.array([9], dtype=np.int8), + 'c': np.array([258], dtype=np.int16), + }) + + # default behaviour for 'use_unsigned' + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False): + out = self.read_csv(StringIO(data), compact_ints=True) + tm.assert_frame_equal(out, expected) + + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False): + out = self.read_csv(StringIO(data), compact_ints=True, + use_unsigned=False) + tm.assert_frame_equal(out, expected) + + expected = DataFrame({ + 'a': np.array([1], dtype=np.uint8), + 'b': np.array([9], dtype=np.uint8), + 'c': np.array([258], dtype=np.uint16), + }) + + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False): + out = self.read_csv(StringIO(data), compact_ints=True, + use_unsigned=True) + tm.assert_frame_equal(out, expected) + + def test_compact_ints_as_recarray(self): + data = ('0,1,0,0\n' + '1,1,0,0\n' + '0,1,0,1') + + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False): + result = self.read_csv(StringIO(data), delimiter=',', header=None, + compact_ints=True, as_recarray=True) + ex_dtype = np.dtype([(str(i), 'i1') for i in range(4)]) + self.assertEqual(result.dtype, ex_dtype) + + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False): + result = self.read_csv(StringIO(data), delimiter=',', header=None, + as_recarray=True, compact_ints=True, + use_unsigned=True) + ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)]) + self.assertEqual(result.dtype, ex_dtype) + + def test_as_recarray(self): + # basic test + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False): + data = 'a,b\n1,a\n2,b' + expected = np.array([(1, 'a'), (2, 'b')], + dtype=[('a', ' exception is raised + continue + + if engine == 'python' and arg == 'buffer_lines': + # unsupported --> exception is raised + continue + + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False): + kwargs = {arg: non_default_val} + read_csv(StringIO(data), engine=engine, + **kwargs) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/io/tests/parser/usecols.py b/pandas/io/tests/parser/usecols.py index 0d3ae95f0d1d4..16a19c50be960 100644 --- a/pandas/io/tests/parser/usecols.py +++ b/pandas/io/tests/parser/usecols.py @@ -5,12 +5,12 @@ for all of the parsers defined in 
parsers.py """ -from datetime import datetime import nose +import numpy as np import pandas.util.testing as tm -from pandas import DataFrame +from pandas import DataFrame, Index from pandas.lib import Timestamp from pandas.compat import StringIO @@ -98,35 +98,31 @@ def test_usecols_index_col_False(self): def test_usecols_index_col_conflict(self): # see gh-4201: test that index_col as integer reflects usecols - data = """SecId,Time,Price,P2,P3 -10000,2013-5-11,100,10,1 -500,2013-5-12,101,11,1 -""" - expected = DataFrame({'Price': [100, 101]}, index=[ - datetime(2013, 5, 11), datetime(2013, 5, 12)]) - expected.index.name = 'Time' + data = 'a,b,c,d\nA,a,1,one\nB,b,2,two' + expected = DataFrame({'c': [1, 2]}, index=Index( + ['a', 'b'], name='b')) - df = self.read_csv(StringIO(data), usecols=[ - 'Time', 'Price'], parse_dates=True, index_col=0) + df = self.read_csv(StringIO(data), usecols=['b', 'c'], + index_col=0) tm.assert_frame_equal(expected, df) - df = self.read_csv(StringIO(data), usecols=[ - 'Time', 'Price'], parse_dates=True, index_col='Time') + df = self.read_csv(StringIO(data), usecols=['b', 'c'], + index_col='b') tm.assert_frame_equal(expected, df) - df = self.read_csv(StringIO(data), usecols=[ - 1, 2], parse_dates=True, index_col='Time') + df = self.read_csv(StringIO(data), usecols=[1, 2], + index_col='b') tm.assert_frame_equal(expected, df) - df = self.read_csv(StringIO(data), usecols=[ - 1, 2], parse_dates=True, index_col=0) + df = self.read_csv(StringIO(data), usecols=[1, 2], + index_col=0) tm.assert_frame_equal(expected, df) expected = DataFrame( - {'P3': [1, 1], 'Price': (100, 101), 'P2': (10, 11)}) - expected = expected.set_index(['Price', 'P2']) - df = self.read_csv(StringIO(data), usecols=[ - 'Price', 'P2', 'P3'], parse_dates=True, index_col=['Price', 'P2']) + {'b': ['a', 'b'], 'c': [1, 2], 'd': ('one', 'two')}) + expected = expected.set_index(['b', 'c']) + df = self.read_csv(StringIO(data), usecols=['b', 'c', 'd'], + index_col=['b', 'c']) tm.assert_frame_equal(expected, df) def test_usecols_implicit_index_col(self): @@ -354,3 +350,19 @@ def test_usecols_with_multibyte_unicode_characters(self): df = self.read_csv(StringIO(s), usecols=[u'あああ', u'いい']) tm.assert_frame_equal(df, expected) + + def test_empty_usecols(self): + # should not raise + data = 'a,b,c\n1,2,3\n4,5,6' + expected = DataFrame() + result = self.read_csv(StringIO(data), usecols=set([])) + tm.assert_frame_equal(result, expected) + + def test_np_array_usecols(self): + # See gh-12546 + data = 'a,b,c\n1,2,3' + usecols = np.array(['a', 'b']) + + expected = DataFrame([[1, 2]], columns=usecols) + result = self.read_csv(StringIO(data), usecols=usecols) + tm.assert_frame_equal(result, expected) diff --git a/pandas/io/tests/sas/test_sas7bdat.py b/pandas/io/tests/sas/test_sas7bdat.py index 6661d9fee5df0..06eb9774679b1 100644 --- a/pandas/io/tests/sas/test_sas7bdat.py +++ b/pandas/io/tests/sas/test_sas7bdat.py @@ -44,7 +44,8 @@ def test_from_buffer(self): df0 = self.data[j] for k in self.test_ix[j]: fname = os.path.join(self.dirpath, "test%d.sas7bdat" % k) - byts = open(fname, 'rb').read() + with open(fname, 'rb') as f: + byts = f.read() buf = io.BytesIO(byts) df = pd.read_sas(buf, format="sas7bdat", encoding='utf-8') tm.assert_frame_equal(df, df0, check_exact=False) @@ -54,7 +55,8 @@ def test_from_iterator(self): df0 = self.data[j] for k in self.test_ix[j]: fname = os.path.join(self.dirpath, "test%d.sas7bdat" % k) - byts = open(fname, 'rb').read() + with open(fname, 'rb') as f: + byts = f.read() buf = io.BytesIO(byts) rdr 
= pd.read_sas(buf, format="sas7bdat", iterator=True, encoding='utf-8') @@ -79,6 +81,7 @@ def test_encoding_options(): from pandas.io.sas.sas7bdat import SAS7BDATReader rdr = SAS7BDATReader(fname, convert_header_text=False) df3 = rdr.read() + rdr.close() for x, y in zip(df1.columns, df3.columns): assert(x == y.decode()) diff --git a/pandas/io/tests/sas/test_xport.py b/pandas/io/tests/sas/test_xport.py index ae378c41cd24b..d0627a80f9604 100644 --- a/pandas/io/tests/sas/test_xport.py +++ b/pandas/io/tests/sas/test_xport.py @@ -39,11 +39,13 @@ def test1_basic(self): # Test incremental read with `read` method. reader = read_sas(self.file01, format="xport", iterator=True) data = reader.read(10) + reader.close() tm.assert_frame_equal(data, data_csv.iloc[0:10, :]) # Test incremental read with `get_chunk` method. reader = read_sas(self.file01, format="xport", chunksize=10) data = reader.get_chunk() + reader.close() tm.assert_frame_equal(data, data_csv.iloc[0:10, :]) # Read full file with `read_sas` method @@ -66,6 +68,7 @@ def test1_index(self): reader = read_sas(self.file01, index="SEQN", format="xport", iterator=True) data = reader.read(10) + reader.close() tm.assert_frame_equal(data, data_csv.iloc[0:10, :], check_index_type=False) @@ -73,6 +76,7 @@ def test1_index(self): reader = read_sas(self.file01, index="SEQN", format="xport", chunksize=10) data = reader.get_chunk() + reader.close() tm.assert_frame_equal(data, data_csv.iloc[0:10, :], check_index_type=False) diff --git a/pandas/io/tests/test_common.py b/pandas/io/tests/test_common.py index 8615b75d87626..c08d235b07c9e 100644 --- a/pandas/io/tests/test_common.py +++ b/pandas/io/tests/test_common.py @@ -1,13 +1,14 @@ """ Tests for the pandas.io.common functionalities """ -from pandas.compat import StringIO +import mmap import os from os.path import isabs import pandas.util.testing as tm from pandas.io import common +from pandas.compat import is_platform_windows, StringIO from pandas import read_csv, concat @@ -85,5 +86,56 @@ def test_iterator(self): it = read_csv(StringIO(self.data1), chunksize=1) first = next(it) tm.assert_frame_equal(first, expected.iloc[[0]]) - expected.index = [0 for i in range(len(expected))] tm.assert_frame_equal(concat(it), expected.iloc[1:]) + + +class TestMMapWrapper(tm.TestCase): + + def setUp(self): + self.mmap_file = os.path.join(tm.get_data_path(), + 'test_mmap.csv') + + def test_constructor_bad_file(self): + non_file = StringIO('I am not a file') + non_file.fileno = lambda: -1 + + # the error raised is different on Windows + if is_platform_windows(): + msg = "The parameter is incorrect" + err = OSError + else: + msg = "[Errno 22]" + err = mmap.error + + tm.assertRaisesRegexp(err, msg, common.MMapWrapper, non_file) + + target = open(self.mmap_file, 'r') + target.close() + + msg = "I/O operation on closed file" + tm.assertRaisesRegexp(ValueError, msg, common.MMapWrapper, target) + + def test_get_attr(self): + with open(self.mmap_file, 'r') as target: + wrapper = common.MMapWrapper(target) + + attrs = dir(wrapper.mmap) + attrs = [attr for attr in attrs + if not attr.startswith('__')] + attrs.append('__next__') + + for attr in attrs: + self.assertTrue(hasattr(wrapper, attr)) + + self.assertFalse(hasattr(wrapper, 'foo')) + + def test_next(self): + with open(self.mmap_file, 'r') as target: + wrapper = common.MMapWrapper(target) + lines = target.readlines() + + for line in lines: + next_line = next(wrapper) + self.assertEqual(next_line.strip(), line.strip()) + + self.assertRaises(StopIteration, next, wrapper) diff --git 
a/pandas/io/tests/test_data.py b/pandas/io/tests/test_data.py deleted file mode 100644 index 1efa8b13598a7..0000000000000 --- a/pandas/io/tests/test_data.py +++ /dev/null @@ -1,586 +0,0 @@ -# flake8: noqa - -from __future__ import print_function -from pandas import compat -import warnings -import nose -from nose.tools import assert_equal -from datetime import datetime -import os - -import numpy as np -import pandas as pd -from pandas import DataFrame, Timestamp -from pandas.util.testing import (assert_series_equal, assert_produces_warning, - network, assert_frame_equal) -import pandas.util.testing as tm - -with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - from pandas.io import data as web - -from pandas.io.data import DataReader, SymbolWarning, RemoteDataError, _yahoo_codes - -if compat.PY3: - from urllib.error import HTTPError -else: - from urllib2 import HTTPError - - -def _skip_if_no_lxml(): - try: - import lxml - except ImportError: - raise nose.SkipTest("no lxml") - -def _skip_if_no_bs(): - try: - import bs4 - import html5lib - except ImportError: - raise nose.SkipTest("no html5lib/bs4") - - -def assert_n_failed_equals_n_null_columns(wngs, obj, cls=SymbolWarning): - all_nan_cols = pd.Series(dict((k, pd.isnull(v).all()) for k, v in - compat.iteritems(obj))) - n_all_nan_cols = all_nan_cols.sum() - valid_warnings = pd.Series([wng for wng in wngs if wng.category == cls]) - assert_equal(len(valid_warnings), n_all_nan_cols) - failed_symbols = all_nan_cols[all_nan_cols].index - msgs = valid_warnings.map(lambda x: x.message) - assert msgs.str.contains('|'.join(failed_symbols)).all() - - -class TestGoogle(tm.TestCase): - @classmethod - def setUpClass(cls): - super(TestGoogle, cls).setUpClass() - cls.locales = tm.get_locales(prefix='en_US') - if not cls.locales: - raise nose.SkipTest("US English locale not available for testing") - - @classmethod - def tearDownClass(cls): - super(TestGoogle, cls).tearDownClass() - del cls.locales - - @network - def test_google(self): - # asserts that google is minimally working and that it throws - # an exception when DataReader can't get a 200 response from - # google - start = datetime(2010, 1, 1) - end = datetime(2013, 1, 27) - - for locale in self.locales: - with tm.set_locale(locale): - panel = web.DataReader("F", 'google', start, end) - self.assertEqual(panel.Close[-1], 13.68) - - self.assertRaises(Exception, web.DataReader, "NON EXISTENT TICKER", - 'google', start, end) - - @network - def test_get_quote_fails(self): - self.assertRaises(NotImplementedError, web.get_quote_google, - pd.Series(['GOOG', 'AAPL', 'GOOG'])) - - @network - def test_get_goog_volume(self): - for locale in self.locales: - with tm.set_locale(locale): - df = web.get_data_google('GOOG').sort_index() - self.assertEqual(df.Volume.ix['JAN-02-2015'], 1446662) - - @network - def test_get_multi1(self): - for locale in self.locales: - sl = ['AAPL', 'AMZN', 'GOOG'] - with tm.set_locale(locale): - pan = web.get_data_google(sl, '2012', '2013') - ts = pan.Close.GOOG.index[pan.Close.AAPL < pan.Close.GOOG] - if (hasattr(pan, 'Close') and hasattr(pan.Close, 'GOOG') and - hasattr(pan.Close, 'AAPL')): - self.assertEqual(ts[0].dayofyear, 3) - else: - self.assertRaises(AttributeError, lambda: pan.Close) - - @network - def test_get_multi_invalid(self): - sl = ['AAPL', 'AMZN', 'INVALID'] - with tm.assert_produces_warning(SymbolWarning): - pan = web.get_data_google(sl, '2012') - self.assertIn('INVALID', pan.minor_axis) - - @network - def test_get_multi_all_invalid(self): - sl = 
['INVALID', 'INVALID2', 'INVALID3'] - with tm.assert_produces_warning(SymbolWarning): - self.assertRaises(RemoteDataError, web.get_data_google, sl, '2012') - - @network - def test_get_multi2(self): - with warnings.catch_warnings(record=True) as w: - for locale in self.locales: - with tm.set_locale(locale): - pan = web.get_data_google(['GE', 'MSFT', 'INTC'], - 'JAN-01-12', 'JAN-31-12') - result = pan.Close.ix['01-18-12'] - assert_n_failed_equals_n_null_columns(w, result) - - # sanity checking - - self.assertTrue(np.issubdtype(result.dtype, np.floating)) - result = pan.Open.ix['Jan-15-12':'Jan-20-12'] - self.assertEqual((4, 3), result.shape) - assert_n_failed_equals_n_null_columns(w, result) - - @network - def test_dtypes(self): - #GH3995, #GH8980 - data = web.get_data_google('F', start='JAN-01-10', end='JAN-27-13') - self.assertTrue(np.issubdtype(data.Open.dtype, np.number)) - self.assertTrue(np.issubdtype(data.Close.dtype, np.number)) - self.assertTrue(np.issubdtype(data.Low.dtype, np.number)) - self.assertTrue(np.issubdtype(data.High.dtype, np.number)) - self.assertTrue(np.issubdtype(data.Volume.dtype, np.number)) - - @network - def test_unicode_date(self): - #GH8967 - data = web.get_data_google('F', start='JAN-01-10', end='JAN-27-13') - self.assertEqual(data.index.name, 'Date') - - -class TestYahoo(tm.TestCase): - @classmethod - def setUpClass(cls): - super(TestYahoo, cls).setUpClass() - _skip_if_no_lxml() - - @network - def test_yahoo(self): - # asserts that yahoo is minimally working and that it throws - # an exception when DataReader can't get a 200 response from - # yahoo - start = datetime(2010, 1, 1) - end = datetime(2013, 1, 27) - - self.assertEqual(web.DataReader("F", 'yahoo', start, end)['Close'][-1], - 13.68) - - @network - def test_yahoo_fails(self): - start = datetime(2010, 1, 1) - end = datetime(2013, 1, 27) - self.assertRaises(Exception, web.DataReader, "NON EXISTENT TICKER", - 'yahoo', start, end) - - @network - def test_get_quote_series(self): - df = web.get_quote_yahoo(pd.Series(['GOOG', 'AAPL', 'GOOG'])) - assert_series_equal(df.ix[0], df.ix[2]) - - @network - def test_get_quote_string(self): - df = web.get_quote_yahoo('GOOG') - - @network - def test_get_quote_string(self): - _yahoo_codes.update({'MarketCap': 'j1'}) - df = web.get_quote_yahoo('GOOG') - self.assertFalse(pd.isnull(df['MarketCap'][0])) - - @network - def test_get_quote_stringlist(self): - df = web.get_quote_yahoo(['GOOG', 'AAPL', 'GOOG']) - assert_series_equal(df.ix[0], df.ix[2]) - - @network - def test_get_components_dow_jones(self): - raise nose.SkipTest('unreliable test, receive partial components back for dow_jones') - - df = web.get_components_yahoo('^DJI') #Dow Jones - self.assertIsInstance(df, pd.DataFrame) - self.assertEqual(len(df), 30) - - @network - def test_get_components_dax(self): - raise nose.SkipTest('unreliable test, receive partial components back for dax') - - df = web.get_components_yahoo('^GDAXI') #DAX - self.assertIsInstance(df, pd.DataFrame) - self.assertEqual(len(df), 30) - self.assertEqual(df[df.name.str.contains('adidas', case=False)].index, - 'ADS.DE') - - @network - def test_get_components_nasdaq_100(self): - # as of 7/12/13 the conditional will test false because the link is invalid - raise nose.SkipTest('unreliable test, receive partial components back for nasdaq_100') - - df = web.get_components_yahoo('^NDX') #NASDAQ-100 - self.assertIsInstance(df, pd.DataFrame) - - if len(df) > 1: - # Usual culprits, should be around for a while - self.assertTrue('AAPL' in df.index) - 
self.assertTrue('GOOG' in df.index) - self.assertTrue('AMZN' in df.index) - else: - expected = DataFrame({'exchange': 'N/A', 'name': '@^NDX'}, - index=['@^NDX']) - assert_frame_equal(df, expected) - - @network - def test_get_data_single_symbol(self): - #single symbol - #http://finance.yahoo.com/q/hp?s=GOOG&a=09&b=08&c=2010&d=09&e=10&f=2010&g=d - # just test that we succeed - web.get_data_yahoo('GOOG') - - @network - def test_get_data_interval(self): - # daily interval data - pan = web.get_data_yahoo('XOM', '2013-01-01', '2013-12-31', interval='d') - self.assertEqual(len(pan), 252) - - # weekly interval data - pan = web.get_data_yahoo('XOM', '2013-01-01', '2013-12-31', interval='w') - self.assertEqual(len(pan), 53) - - # montly interval data - pan = web.get_data_yahoo('XOM', '2013-01-01', '2013-12-31', interval='m') - self.assertEqual(len(pan), 12) - - # dividend data - pan = web.get_data_yahoo('XOM', '2013-01-01', '2013-12-31', interval='v') - self.assertEqual(len(pan), 4) - - # test fail on invalid interval - self.assertRaises(ValueError, web.get_data_yahoo, 'XOM', interval='NOT VALID') - - @network - def test_get_data_multiple_symbols(self): - # just test that we succeed - sl = ['AAPL', 'AMZN', 'GOOG'] - web.get_data_yahoo(sl, '2012') - - @network - def test_get_data_multiple_symbols_two_dates(self): - pan = web.get_data_yahoo(['GE', 'MSFT', 'INTC'], 'JAN-01-12', - 'JAN-31-12') - result = pan.Close.ix['01-18-12'] - self.assertEqual(len(result), 3) - - # sanity checking - self.assertTrue(np.issubdtype(result.dtype, np.floating)) - - expected = np.array([[18.99, 28.4, 25.18], - [18.58, 28.31, 25.13], - [19.03, 28.16, 25.52], - [18.81, 28.82, 25.87]]) - result = pan.Open.ix['Jan-15-12':'Jan-20-12'] - self.assertEqual(expected.shape, result.shape) - - @network - def test_get_date_ret_index(self): - pan = web.get_data_yahoo(['GE', 'INTC', 'IBM'], '1977', '1987', - ret_index=True) - self.assertTrue(hasattr(pan, 'Ret_Index')) - if hasattr(pan, 'Ret_Index') and hasattr(pan.Ret_Index, 'INTC'): - tstamp = pan.Ret_Index.INTC.first_valid_index() - result = pan.Ret_Index.ix[tstamp]['INTC'] - self.assertEqual(result, 1.0) - - # sanity checking - self.assertTrue(np.issubdtype(pan.values.dtype, np.floating)) - - -class TestYahooOptions(tm.TestCase): - - @classmethod - def setUpClass(cls): - super(TestYahooOptions, cls).setUpClass() - raise nose.SkipTest('disable Yahoo Options tests') - - _skip_if_no_lxml() - _skip_if_no_bs() - raise nose.SkipTest('unreliable test') - - # aapl has monthlies - cls.aapl = web.Options('aapl', 'yahoo') - d = (Timestamp.today() + pd.offsets.MonthBegin(1)).normalize() - cls.year = d.year - cls.month = d.month - cls.expiry = d - cls.expiry2 = d + pd.offsets.MonthBegin(1) - cls.dirpath = tm.get_data_path() - cls.html1 = os.path.join(cls.dirpath, 'yahoo_options1.html') - cls.html2 = os.path.join(cls.dirpath, 'yahoo_options2.html') - cls.html3 = os.path.join(cls.dirpath, 'yahoo_options3.html') #Empty table GH#22 - cls.data1 = cls.aapl._option_frames_from_url(cls.html1)['puts'] - - @classmethod - def tearDownClass(cls): - super(TestYahooOptions, cls).tearDownClass() - del cls.aapl, cls.expiry - - @network - def test_get_options_data(self): - # regression test GH6105 - self.assertRaises(ValueError, self.aapl.get_options_data, month=3) - self.assertRaises(ValueError, self.aapl.get_options_data, year=1992) - - try: - options = self.aapl.get_options_data(expiry=self.expiry) - except RemoteDataError as e: - raise nose.SkipTest(e) - self.assertTrue(len(options) > 1) - - @network - def 
test_get_near_stock_price(self): - try: - options = self.aapl.get_near_stock_price(call=True, put=True, - expiry=[self.expiry,self.expiry2]) - except RemoteDataError as e: - raise nose.SkipTest(e) - self.assertTrue(len(options) > 1) - - @network - def test_get_call_data(self): - try: - calls = self.aapl.get_call_data(expiry=self.expiry) - except RemoteDataError as e: - raise nose.SkipTest(e) - self.assertTrue(len(calls) > 1) - - @network - def test_get_put_data(self): - try: - puts = self.aapl.get_put_data(expiry=self.expiry) - except RemoteDataError as e: - raise nose.SkipTest(e) - self.assertTrue(len(puts) > 1) - - @network - def test_get_expiry_dates(self): - try: - dates, _ = self.aapl._get_expiry_dates_and_links() - except RemoteDataError as e: - raise nose.SkipTest(e) - self.assertTrue(len(dates) > 1) - - @network - def test_get_all_data(self): - - try: - data = self.aapl.get_all_data(put=True) - except RemoteDataError as e: - raise nose.SkipTest(e) - self.assertTrue(len(data) > 1) - - @network - def test_get_data_with_list(self): - try: - data = self.aapl.get_call_data(expiry=self.aapl.expiry_dates) - except RemoteDataError as e: - raise nose.SkipTest(e) - self.assertTrue(len(data) > 1) - - @network - def test_get_all_data_calls_only(self): - try: - data = self.aapl.get_all_data(call=True, put=False) - except RemoteDataError as e: - raise nose.SkipTest(e) - self.assertTrue(len(data) > 1) - - @network - def test_get_underlying_price(self): - #GH7 - try: - options_object = web.Options('^spxpm', 'yahoo') - url = options_object._yahoo_url_from_expiry(options_object.expiry_dates[0]) - root = options_object._parse_url(url) - quote_price = options_object._underlying_price_from_root(root) - except RemoteDataError as e: - raise nose.SkipTest(e) - self.assertIsInstance(quote_price, float) - - def test_sample_page_price_quote_time1(self): - #Tests the weekend quote time format - price, quote_time = self.aapl._underlying_price_and_time_from_url(self.html1) - self.assertIsInstance(price, (int, float, complex)) - self.assertIsInstance(quote_time, (datetime, Timestamp)) - - def test_chop(self): - #regression test for #7625 - self.aapl.chop_data(self.data1, above_below=2, underlying_price=np.nan) - chopped = self.aapl.chop_data(self.data1, above_below=2, underlying_price=100) - self.assertIsInstance(chopped, DataFrame) - self.assertTrue(len(chopped) > 1) - - def test_chop_out_of_strike_range(self): - #regression test for #7625 - self.aapl.chop_data(self.data1, above_below=2, underlying_price=np.nan) - chopped = self.aapl.chop_data(self.data1, above_below=2, underlying_price=100000) - self.assertIsInstance(chopped, DataFrame) - self.assertTrue(len(chopped) > 1) - - - @network - def test_sample_page_price_quote_time2(self): - #Tests the EDT page format - #regression test for #8741 - price, quote_time = self.aapl._underlying_price_and_time_from_url(self.html2) - self.assertIsInstance(price, (int, float, complex)) - self.assertIsInstance(quote_time, (datetime, Timestamp)) - - @network - def test_sample_page_chg_float(self): - #Tests that numeric columns with comma's are appropriately dealt with - self.assertEqual(self.data1['Chg'].dtype, 'float64') - - @network - def test_month_year(self): - try: - data = self.aapl.get_call_data(month=self.month, year=self.year) - except RemoteDataError as e: - raise nose.SkipTest(e) - - self.assertTrue(len(data) > 1) - - @network - def test_empty_table(self): - #GH22 - empty = self.aapl._option_frames_from_url(self.html3)['puts'] - self.assertTrue(len(empty) == 0) - - 
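# --- editorial sketch, not part of the patch -------------------------------
# The Yahoo options tests above (deleted in this patch) all repeat the same
# guard: run the remote call, and if the endpoint misbehaves convert the
# RemoteDataError into a skipped test rather than a failure. A minimal sketch
# of that pattern as a standalone helper is shown below; the helper name is
# illustrative only, and it assumes `nose` and `RemoteDataError` are imported
# as they are at the top of this test module.
def fetch_or_skip(fetch, *args, **kwargs):
    # Run a remote fetch; skip the calling test instead of failing when the
    # remote endpoint is unavailable or flaky.
    try:
        return fetch(*args, **kwargs)
    except RemoteDataError as e:
        raise nose.SkipTest(e)
# Example use inside one of the tests above:
#     calls = fetch_or_skip(self.aapl.get_call_data, expiry=self.expiry)
#     self.assertTrue(len(calls) > 1)
# ----------------------------------------------------------------------------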
-class TestOptionsWarnings(tm.TestCase): - @classmethod - def setUpClass(cls): - super(TestOptionsWarnings, cls).setUpClass() - - @classmethod - def tearDownClass(cls): - super(TestOptionsWarnings, cls).tearDownClass() - - @network - def test_options_source_warning(self): - with assert_produces_warning(): - aapl = web.Options('aapl') - - -class TestDataReader(tm.TestCase): - - @network - def test_read_yahoo(self): - gs = DataReader("GS", "yahoo") - self.assertIsInstance(gs, DataFrame) - - @network - def test_read_google(self): - gs = DataReader("GS", "google") - self.assertIsInstance(gs, DataFrame) - - @network - def test_read_fred(self): - vix = DataReader("VIXCLS", "fred") - self.assertIsInstance(vix, DataFrame) - - @network - def test_read_famafrench(self): - raise nose.SkipTest('buggy as of 2/14/16; maybe a data revision?') - for name in ("F-F_Research_Data_Factors", - "F-F_Research_Data_Factors_weekly", "6_Portfolios_2x3", - "F-F_ST_Reversal_Factor", "F-F_Momentum_Factor"): - ff = DataReader(name, "famafrench") - self.assertTrue(ff is not None) - self.assertIsInstance(ff, dict) - - -class TestFred(tm.TestCase): - - @classmethod - def setUpClass(cls): - super(TestFred, cls).setUpClass() - raise nose.SkipTest('disable Fred tests') - - @network - def test_fred(self): - raise nose.SkipTest('buggy as of 2/14/16; maybe a data revision?') - - # Throws an exception when DataReader can't get a 200 response from - # FRED. - - start = datetime(2010, 1, 1) - end = datetime(2013, 1, 27) - - received = web.DataReader("GDP", "fred", start, end)['GDP'].tail(1)[0] - self.assertTrue(int(received) > 10000) - - self.assertRaises(Exception, web.DataReader, "NON EXISTENT SERIES", - 'fred', start, end) - - @network - def test_fred_nan(self): - start = datetime(2010, 1, 1) - end = datetime(2013, 1, 27) - df = web.DataReader("DFII5", "fred", start, end) - self.assertTrue(pd.isnull(df.ix['2010-01-01'][0])) - - @network - def test_fred_parts(self): - raise nose.SkipTest('buggy as of 2/18/14; maybe a data revision?') - - start = datetime(2010, 1, 1) - end = datetime(2013, 1, 27) - df = web.get_data_fred("CPIAUCSL", start, end) - self.assertEqual(df.ix['2010-05-01'][0], 217.23) - - t = df.CPIAUCSL.values - self.assertTrue(np.issubdtype(t.dtype, np.floating)) - self.assertEqual(t.shape, (37,)) - - @network - def test_fred_part2(self): - expected = [[576.7], - [962.9], - [684.7], - [848.3], - [933.3]] - result = web.get_data_fred("A09024USA144NNBR", start="1915").ix[:5] - tm.assert_numpy_array_equal(result.values, np.array(expected)) - - @network - def test_invalid_series(self): - name = "NOT A REAL SERIES" - self.assertRaises(Exception, web.get_data_fred, name) - - @network - def test_fred_multi(self): - raise nose.SkipTest('buggy as of 2/18/14; maybe a data revision?') - - names = ['CPIAUCSL', 'CPALTT01USQ661S', 'CPILFESL'] - start = datetime(2010, 1, 1) - end = datetime(2013, 1, 27) - - received = web.DataReader(names, "fred", start, end).head(1) - expected = DataFrame([[217.478, 0.99701529, 220.544]], columns=names, - index=[pd.tslib.Timestamp('2010-01-01 00:00:00')]) - expected.index.rename('DATE', inplace=True) - assert_frame_equal(received, expected, check_less_precise=True) - - @network - def test_fred_multi_bad_series(self): - - names = ['NOTAREALSERIES', 'CPIAUCSL', "ALSO FAKE"] - with tm.assertRaises(HTTPError): - DataReader(names, data_source="fred") - - -if __name__ == '__main__': - nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], - exit=False) diff --git 
a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py index b7e5360a6f3db..449c27482e0a5 100644 --- a/pandas/io/tests/test_excel.py +++ b/pandas/io/tests/test_excel.py @@ -244,6 +244,21 @@ def test_excel_passes_na(self): columns=['Test']) tm.assert_frame_equal(parsed, expected) + # 13967 + excel = self.get_excelfile('test5') + + parsed = read_excel(excel, 'Sheet1', keep_default_na=False, + na_values=['apple']) + expected = DataFrame([['1.#QNAN'], [1], ['nan'], [np.nan], ['rabbit']], + columns=['Test']) + tm.assert_frame_equal(parsed, expected) + + parsed = read_excel(excel, 'Sheet1', keep_default_na=True, + na_values=['apple']) + expected = DataFrame([[np.nan], [1], [np.nan], [np.nan], ['rabbit']], + columns=['Test']) + tm.assert_frame_equal(parsed, expected) + def test_excel_table_sheet_by_index(self): excel = self.get_excelfile('test1') @@ -725,6 +740,46 @@ def test_read_excel_multiindex(self): header=[0, 1], skiprows=2) tm.assert_frame_equal(actual, expected) + def test_read_excel_multiindex_empty_level(self): + # GH 12453 + _skip_if_no_xlsxwriter() + with ensure_clean('.xlsx') as path: + df = DataFrame({ + ('Zero', ''): {0: 0}, + ('One', 'x'): {0: 1}, + ('Two', 'X'): {0: 3}, + ('Two', 'Y'): {0: 7} + }) + + expected = DataFrame({ + ('Zero', 'Unnamed: 3_level_1'): {0: 0}, + ('One', u'x'): {0: 1}, + ('Two', u'X'): {0: 3}, + ('Two', u'Y'): {0: 7} + }) + + df.to_excel(path) + actual = pd.read_excel(path, header=[0, 1]) + tm.assert_frame_equal(actual, expected) + + df = pd.DataFrame({ + ('Beg', ''): {0: 0}, + ('Middle', 'x'): {0: 1}, + ('Tail', 'X'): {0: 3}, + ('Tail', 'Y'): {0: 7} + }) + + expected = pd.DataFrame({ + ('Beg', 'Unnamed: 0_level_1'): {0: 0}, + ('Middle', u'x'): {0: 1}, + ('Tail', u'X'): {0: 3}, + ('Tail', u'Y'): {0: 7} + }) + + df.to_excel(path) + actual = pd.read_excel(path, header=[0, 1]) + tm.assert_frame_equal(actual, expected) + def test_excel_multindex_roundtrip(self): # GH 4679 _skip_if_no_xlsxwriter() @@ -1288,6 +1343,20 @@ def test_to_excel_multiindex(self): parse_dates=False) tm.assert_frame_equal(frame, df) + # GH13511 + def test_to_excel_multiindex_nan_label(self): + _skip_if_no_xlrd() + + frame = pd.DataFrame({'A': [None, 2, 3], + 'B': [10, 20, 30], + 'C': np.random.sample(3)}) + frame = frame.set_index(['A', 'B']) + + with ensure_clean(self.ext) as path: + frame.to_excel(path, merge_cells=self.merge_cells) + df = read_excel(path, index_col=[0, 1]) + tm.assert_frame_equal(frame, df) + # Test for Issue 11328. 
If column indices are integers, make # sure they are handled correctly for either setting of # merge_cells diff --git a/pandas/io/tests/test_gbq.py b/pandas/io/tests/test_gbq.py index 5cb681f4d2e7d..921fd824d6ffd 100644 --- a/pandas/io/tests/test_gbq.py +++ b/pandas/io/tests/test_gbq.py @@ -4,6 +4,8 @@ import pytz import platform from time import sleep +import os +import logging import numpy as np @@ -21,7 +23,11 @@ PRIVATE_KEY_JSON_PATH = None PRIVATE_KEY_JSON_CONTENTS = None -DATASET_ID = 'pydata_pandas_bq_testing' +if compat.PY3: + DATASET_ID = 'pydata_pandas_bq_testing_py3' +else: + DATASET_ID = 'pydata_pandas_bq_testing_py2' + TABLE_ID = 'new_test' DESTINATION_TABLE = "{0}.{1}".format(DATASET_ID + "1", TABLE_ID) @@ -35,25 +41,50 @@ def _skip_if_no_project_id(): - if not PROJECT_ID: + if not _get_project_id(): raise nose.SkipTest( "Cannot run integration tests without a project id") def _skip_if_no_private_key_path(): - if not PRIVATE_KEY_JSON_PATH: + if not _get_private_key_path(): raise nose.SkipTest("Cannot run integration tests without a " "private key json file path") def _skip_if_no_private_key_contents(): - if not PRIVATE_KEY_JSON_CONTENTS: + if not _get_private_key_contents(): raise nose.SkipTest("Cannot run integration tests without a " "private key json contents") - _skip_if_no_project_id() - _skip_if_no_private_key_path() - _skip_if_no_private_key_contents() + +def _in_travis_environment(): + return 'TRAVIS_BUILD_DIR' in os.environ and \ + 'GBQ_PROJECT_ID' in os.environ + + +def _get_project_id(): + if _in_travis_environment(): + return os.environ.get('GBQ_PROJECT_ID') + else: + return PROJECT_ID + + +def _get_private_key_path(): + if _in_travis_environment(): + return os.path.join(*[os.environ.get('TRAVIS_BUILD_DIR'), 'ci', + 'travis_gbq.json']) + else: + return PRIVATE_KEY_JSON_PATH + + +def _get_private_key_contents(): + if _in_travis_environment(): + with open(os.path.join(*[os.environ.get('TRAVIS_BUILD_DIR'), 'ci', + 'travis_gbq.json'])) as f: + return f.read() + else: + return PRIVATE_KEY_JSON_CONTENTS def _test_imports(): @@ -73,8 +104,12 @@ def _test_imports(): if _SETUPTOOLS_INSTALLED: try: - from apiclient.discovery import build # noqa - from apiclient.errors import HttpError # noqa + try: + from googleapiclient.discovery import build # noqa + from googleapiclient.errors import HttpError # noqa + except: + from apiclient.discovery import build # noqa + from apiclient.errors import HttpError # noqa from oauth2client.client import OAuth2WebServerFlow # noqa from oauth2client.client import AccessTokenRefreshError # noqa @@ -140,20 +175,49 @@ def _test_imports(): "service account support") -def test_requirements(): +def _setup_common(): try: _test_imports() except (ImportError, NotImplementedError) as import_exception: raise nose.SkipTest(import_exception) + if _in_travis_environment(): + logging.getLogger('oauth2client').setLevel(logging.ERROR) + logging.getLogger('apiclient').setLevel(logging.ERROR) + + +def _check_if_can_get_correct_default_credentials(): + # Checks if "Application Default Credentials" can be fetched + # from the environment the tests are running in. 
+ # See Issue #13577 + + import httplib2 + try: + from googleapiclient.discovery import build + except ImportError: + from apiclient.discovery import build + try: + from oauth2client.client import GoogleCredentials + credentials = GoogleCredentials.get_application_default() + http = httplib2.Http() + http = credentials.authorize(http) + bigquery_service = build('bigquery', 'v2', http=http) + jobs = bigquery_service.jobs() + job_data = {'configuration': {'query': {'query': 'SELECT 1'}}} + jobs.insert(projectId=_get_project_id(), body=job_data).execute() + return True + except: + return False + def clean_gbq_environment(private_key=None): - dataset = gbq._Dataset(PROJECT_ID, private_key=private_key) + dataset = gbq._Dataset(_get_project_id(), private_key=private_key) for i in range(1, 10): if DATASET_ID + str(i) in dataset.datasets(): dataset_id = DATASET_ID + str(i) - table = gbq._Table(PROJECT_ID, dataset_id, private_key=private_key) + table = gbq._Table(_get_project_id(), dataset_id, + private_key=private_key) for j in range(1, 20): if TABLE_ID + str(j) in dataset.tables(dataset_id): table.delete(TABLE_ID + str(j)) @@ -187,11 +251,11 @@ def test_generate_bq_schema_deprecated(): class TestGBQConnectorIntegration(tm.TestCase): def setUp(self): - test_requirements() - + _setup_common() _skip_if_no_project_id() - self.sut = gbq.GbqConnector(PROJECT_ID) + self.sut = gbq.GbqConnector(_get_project_id(), + private_key=_get_private_key_path()) def test_should_be_able_to_make_a_connector(self): self.assertTrue(self.sut is not None, @@ -213,16 +277,31 @@ def test_should_be_able_to_get_results_from_query(self): schema, pages = self.sut.run_query('SELECT 1') self.assertTrue(pages is not None) + def test_get_application_default_credentials_does_not_throw_error(self): + if _check_if_can_get_correct_default_credentials(): + raise nose.SkipTest("Can get default_credentials " + "from the environment!") + credentials = self.sut.get_application_default_credentials() + self.assertIsNone(credentials) + + def test_get_application_default_credentials_returns_credentials(self): + if not _check_if_can_get_correct_default_credentials(): + raise nose.SkipTest("Cannot get default_credentials " + "from the environment!") + from oauth2client.client import GoogleCredentials + credentials = self.sut.get_application_default_credentials() + self.assertTrue(isinstance(credentials, GoogleCredentials)) + class TestGBQConnectorServiceAccountKeyPathIntegration(tm.TestCase): def setUp(self): - test_requirements() + _setup_common() _skip_if_no_project_id() _skip_if_no_private_key_path() - self.sut = gbq.GbqConnector(PROJECT_ID, - private_key=PRIVATE_KEY_JSON_PATH) + self.sut = gbq.GbqConnector(_get_project_id(), + private_key=_get_private_key_path()) def test_should_be_able_to_make_a_connector(self): self.assertTrue(self.sut is not None, @@ -247,13 +326,13 @@ def test_should_be_able_to_get_results_from_query(self): class TestGBQConnectorServiceAccountKeyContentsIntegration(tm.TestCase): def setUp(self): - test_requirements() + _setup_common() _skip_if_no_project_id() - _skip_if_no_private_key_contents() + _skip_if_no_private_key_path() - self.sut = gbq.GbqConnector(PROJECT_ID, - private_key=PRIVATE_KEY_JSON_CONTENTS) + self.sut = gbq.GbqConnector(_get_project_id(), + private_key=_get_private_key_path()) def test_should_be_able_to_make_a_connector(self): self.assertTrue(self.sut is not None, @@ -278,7 +357,18 @@ def test_should_be_able_to_get_results_from_query(self): class GBQUnitTests(tm.TestCase): def setUp(self): - 
test_requirements() + _setup_common() + + def test_import_google_api_python_client(self): + if compat.PY2: + with tm.assertRaises(ImportError): + from googleapiclient.discovery import build # noqa + from googleapiclient.errors import HttpError # noqa + from apiclient.discovery import build # noqa + from apiclient.errors import HttpError # noqa + else: + from googleapiclient.discovery import build # noqa + from googleapiclient.errors import HttpError # noqa def test_should_return_bigquery_integers_as_python_floats(self): result = gbq._parse_entry(1, 'INTEGER') @@ -342,12 +432,12 @@ def test_read_gbq_with_empty_private_key_file_should_fail(self): private_key=empty_file_path) def test_read_gbq_with_corrupted_private_key_json_should_fail(self): - _skip_if_no_private_key_contents() + _skip_if_no_private_key_path() with tm.assertRaises(gbq.InvalidPrivateKeyFormat): gbq.read_gbq( 'SELECT 1', project_id='x', - private_key=re.sub('[a-z]', '9', PRIVATE_KEY_JSON_CONTENTS)) + private_key=re.sub('[a-z]', '9', _get_private_key_path())) class TestReadGBQIntegration(tm.TestCase): @@ -360,7 +450,7 @@ def setUpClass(cls): _skip_if_no_project_id() - test_requirements() + _setup_common() def setUp(self): # - PER-TEST FIXTURES - @@ -381,87 +471,108 @@ def tearDown(self): # executed. pass + def test_should_read_as_user_account(self): + if _in_travis_environment(): + raise nose.SkipTest("Cannot run local auth in travis environment") + + query = 'SELECT "PI" as VALID_STRING' + df = gbq.read_gbq(query, project_id=_get_project_id()) + tm.assert_frame_equal(df, DataFrame({'VALID_STRING': ['PI']})) + def test_should_read_as_service_account_with_key_path(self): _skip_if_no_private_key_path() query = 'SELECT "PI" as VALID_STRING' - df = gbq.read_gbq(query, project_id=PROJECT_ID, - private_key=PRIVATE_KEY_JSON_PATH) + df = gbq.read_gbq(query, project_id=_get_project_id(), + private_key=_get_private_key_path()) tm.assert_frame_equal(df, DataFrame({'VALID_STRING': ['PI']})) def test_should_read_as_service_account_with_key_contents(self): _skip_if_no_private_key_contents() query = 'SELECT "PI" as VALID_STRING' - df = gbq.read_gbq(query, project_id=PROJECT_ID, - private_key=PRIVATE_KEY_JSON_CONTENTS) + df = gbq.read_gbq(query, project_id=_get_project_id(), + private_key=_get_private_key_contents()) tm.assert_frame_equal(df, DataFrame({'VALID_STRING': ['PI']})) def test_should_properly_handle_valid_strings(self): query = 'SELECT "PI" as VALID_STRING' - df = gbq.read_gbq(query, project_id=PROJECT_ID) + df = gbq.read_gbq(query, project_id=_get_project_id(), + private_key=_get_private_key_path()) tm.assert_frame_equal(df, DataFrame({'VALID_STRING': ['PI']})) def test_should_properly_handle_empty_strings(self): query = 'SELECT "" as EMPTY_STRING' - df = gbq.read_gbq(query, project_id=PROJECT_ID) + df = gbq.read_gbq(query, project_id=_get_project_id(), + private_key=_get_private_key_path()) tm.assert_frame_equal(df, DataFrame({'EMPTY_STRING': [""]})) def test_should_properly_handle_null_strings(self): query = 'SELECT STRING(NULL) as NULL_STRING' - df = gbq.read_gbq(query, project_id=PROJECT_ID) + df = gbq.read_gbq(query, project_id=_get_project_id(), + private_key=_get_private_key_path()) tm.assert_frame_equal(df, DataFrame({'NULL_STRING': [None]})) def test_should_properly_handle_valid_integers(self): query = 'SELECT INTEGER(3) as VALID_INTEGER' - df = gbq.read_gbq(query, project_id=PROJECT_ID) + df = gbq.read_gbq(query, project_id=_get_project_id(), + private_key=_get_private_key_path()) tm.assert_frame_equal(df, 
DataFrame({'VALID_INTEGER': [3]})) def test_should_properly_handle_null_integers(self): query = 'SELECT INTEGER(NULL) as NULL_INTEGER' - df = gbq.read_gbq(query, project_id=PROJECT_ID) + df = gbq.read_gbq(query, project_id=_get_project_id(), + private_key=_get_private_key_path()) tm.assert_frame_equal(df, DataFrame({'NULL_INTEGER': [np.nan]})) def test_should_properly_handle_valid_floats(self): query = 'SELECT PI() as VALID_FLOAT' - df = gbq.read_gbq(query, project_id=PROJECT_ID) + df = gbq.read_gbq(query, project_id=_get_project_id(), + private_key=_get_private_key_path()) tm.assert_frame_equal(df, DataFrame( {'VALID_FLOAT': [3.141592653589793]})) def test_should_properly_handle_null_floats(self): query = 'SELECT FLOAT(NULL) as NULL_FLOAT' - df = gbq.read_gbq(query, project_id=PROJECT_ID) + df = gbq.read_gbq(query, project_id=_get_project_id(), + private_key=_get_private_key_path()) tm.assert_frame_equal(df, DataFrame({'NULL_FLOAT': [np.nan]})) def test_should_properly_handle_timestamp_unix_epoch(self): query = 'SELECT TIMESTAMP("1970-01-01 00:00:00") as UNIX_EPOCH' - df = gbq.read_gbq(query, project_id=PROJECT_ID) + df = gbq.read_gbq(query, project_id=_get_project_id(), + private_key=_get_private_key_path()) tm.assert_frame_equal(df, DataFrame( {'UNIX_EPOCH': [np.datetime64('1970-01-01T00:00:00.000000Z')]})) def test_should_properly_handle_arbitrary_timestamp(self): query = 'SELECT TIMESTAMP("2004-09-15 05:00:00") as VALID_TIMESTAMP' - df = gbq.read_gbq(query, project_id=PROJECT_ID) + df = gbq.read_gbq(query, project_id=_get_project_id(), + private_key=_get_private_key_path()) tm.assert_frame_equal(df, DataFrame({ 'VALID_TIMESTAMP': [np.datetime64('2004-09-15T05:00:00.000000Z')] })) def test_should_properly_handle_null_timestamp(self): query = 'SELECT TIMESTAMP(NULL) as NULL_TIMESTAMP' - df = gbq.read_gbq(query, project_id=PROJECT_ID) + df = gbq.read_gbq(query, project_id=_get_project_id(), + private_key=_get_private_key_path()) tm.assert_frame_equal(df, DataFrame({'NULL_TIMESTAMP': [NaT]})) def test_should_properly_handle_true_boolean(self): query = 'SELECT BOOLEAN(TRUE) as TRUE_BOOLEAN' - df = gbq.read_gbq(query, project_id=PROJECT_ID) + df = gbq.read_gbq(query, project_id=_get_project_id(), + private_key=_get_private_key_path()) tm.assert_frame_equal(df, DataFrame({'TRUE_BOOLEAN': [True]})) def test_should_properly_handle_false_boolean(self): query = 'SELECT BOOLEAN(FALSE) as FALSE_BOOLEAN' - df = gbq.read_gbq(query, project_id=PROJECT_ID) + df = gbq.read_gbq(query, project_id=_get_project_id(), + private_key=_get_private_key_path()) tm.assert_frame_equal(df, DataFrame({'FALSE_BOOLEAN': [False]})) def test_should_properly_handle_null_boolean(self): query = 'SELECT BOOLEAN(NULL) as NULL_BOOLEAN' - df = gbq.read_gbq(query, project_id=PROJECT_ID) + df = gbq.read_gbq(query, project_id=_get_project_id(), + private_key=_get_private_key_path()) tm.assert_frame_equal(df, DataFrame({'NULL_BOOLEAN': [None]})) def test_unicode_string_conversion_and_normalization(self): @@ -476,13 +587,15 @@ def test_unicode_string_conversion_and_normalization(self): query = 'SELECT "{0}" as UNICODE_STRING'.format(unicode_string) - df = gbq.read_gbq(query, project_id=PROJECT_ID) + df = gbq.read_gbq(query, project_id=_get_project_id(), + private_key=_get_private_key_path()) tm.assert_frame_equal(df, correct_test_datatype) def test_index_column(self): query = "SELECT 'a' as STRING_1, 'b' as STRING_2" - result_frame = gbq.read_gbq( - query, project_id=PROJECT_ID, index_col="STRING_1") + result_frame = 
gbq.read_gbq(query, project_id=_get_project_id(), + index_col="STRING_1", + private_key=_get_private_key_path()) correct_frame = DataFrame( {'STRING_1': ['a'], 'STRING_2': ['b']}).set_index("STRING_1") tm.assert_equal(result_frame.index.name, correct_frame.index.name) @@ -490,8 +603,9 @@ def test_index_column(self): def test_column_order(self): query = "SELECT 'a' as STRING_1, 'b' as STRING_2, 'c' as STRING_3" col_order = ['STRING_3', 'STRING_1', 'STRING_2'] - result_frame = gbq.read_gbq( - query, project_id=PROJECT_ID, col_order=col_order) + result_frame = gbq.read_gbq(query, project_id=_get_project_id(), + col_order=col_order, + private_key=_get_private_key_path()) correct_frame = DataFrame({'STRING_1': ['a'], 'STRING_2': [ 'b'], 'STRING_3': ['c']})[col_order] tm.assert_frame_equal(result_frame, correct_frame) @@ -499,8 +613,9 @@ def test_column_order(self): def test_column_order_plus_index(self): query = "SELECT 'a' as STRING_1, 'b' as STRING_2, 'c' as STRING_3" col_order = ['STRING_3', 'STRING_2'] - result_frame = gbq.read_gbq(query, project_id=PROJECT_ID, - index_col='STRING_1', col_order=col_order) + result_frame = gbq.read_gbq(query, project_id=_get_project_id(), + index_col='STRING_1', col_order=col_order, + private_key=_get_private_key_path()) correct_frame = DataFrame( {'STRING_1': ['a'], 'STRING_2': ['b'], 'STRING_3': ['c']}) correct_frame.set_index('STRING_1', inplace=True) @@ -510,16 +625,19 @@ def test_column_order_plus_index(self): def test_malformed_query(self): with tm.assertRaises(gbq.GenericGBQException): gbq.read_gbq("SELCET * FORM [publicdata:samples.shakespeare]", - project_id=PROJECT_ID) + project_id=_get_project_id(), + private_key=_get_private_key_path()) def test_bad_project_id(self): with tm.assertRaises(gbq.GenericGBQException): - gbq.read_gbq("SELECT 1", project_id='001') + gbq.read_gbq("SELECT 1", project_id='001', + private_key=_get_private_key_path()) def test_bad_table_name(self): with tm.assertRaises(gbq.GenericGBQException): gbq.read_gbq("SELECT * FROM [publicdata:samples.nope]", - project_id=PROJECT_ID) + project_id=_get_project_id(), + private_key=_get_private_key_path()) def test_download_dataset_larger_than_200k_rows(self): test_size = 200005 @@ -528,7 +646,8 @@ def test_download_dataset_larger_than_200k_rows(self): df = gbq.read_gbq("SELECT id FROM [publicdata:samples.wikipedia] " "GROUP EACH BY id ORDER BY id ASC LIMIT {0}" .format(test_size), - project_id=PROJECT_ID) + project_id=_get_project_id(), + private_key=_get_private_key_path()) self.assertEqual(len(df.drop_duplicates()), test_size) def test_zero_rows(self): @@ -536,12 +655,62 @@ def test_zero_rows(self): df = gbq.read_gbq("SELECT title, id " "FROM [publicdata:samples.wikipedia] " "WHERE timestamp=-9999999", - project_id=PROJECT_ID) + project_id=_get_project_id(), + private_key=_get_private_key_path()) page_array = np.zeros( (0,), dtype=[('title', object), ('id', np.dtype(float))]) expected_result = DataFrame(page_array, columns=['title', 'id']) self.assert_frame_equal(df, expected_result) + def test_legacy_sql(self): + legacy_sql = "SELECT id FROM [publicdata.samples.wikipedia] LIMIT 10" + + # Test that a legacy sql statement fails when + # setting dialect='standard' + with tm.assertRaises(gbq.GenericGBQException): + gbq.read_gbq(legacy_sql, project_id=_get_project_id(), + dialect='standard', + private_key=_get_private_key_path()) + + # Test that a legacy sql statement succeeds when + # setting dialect='legacy' + df = gbq.read_gbq(legacy_sql, project_id=_get_project_id(), + 
dialect='legacy', + private_key=_get_private_key_path()) + self.assertEqual(len(df.drop_duplicates()), 10) + + def test_standard_sql(self): + standard_sql = "SELECT DISTINCT id FROM " \ + "`publicdata.samples.wikipedia` LIMIT 10" + + # Test that a standard sql statement fails when using + # the legacy SQL dialect (default value) + with tm.assertRaises(gbq.GenericGBQException): + gbq.read_gbq(standard_sql, project_id=_get_project_id(), + private_key=_get_private_key_path()) + + # Test that a standard sql statement succeeds when + # setting dialect='standard' + df = gbq.read_gbq(standard_sql, project_id=_get_project_id(), + dialect='standard', + private_key=_get_private_key_path()) + self.assertEqual(len(df.drop_duplicates()), 10) + + def test_invalid_option_for_sql_dialect(self): + sql_statement = "SELECT DISTINCT id FROM " \ + "`publicdata.samples.wikipedia` LIMIT 10" + + # Test that an invalid option for `dialect` raises ValueError + with tm.assertRaises(ValueError): + gbq.read_gbq(sql_statement, project_id=_get_project_id(), + dialect='invalid', + private_key=_get_private_key_path()) + + # Test that a correct option for dialect succeeds + # to make sure ValueError was due to invalid dialect + gbq.read_gbq(sql_statement, project_id=_get_project_id(), + dialect='standard', private_key=_get_private_key_path()) + class TestToGBQIntegration(tm.TestCase): # Changes to BigQuery table schema may take up to 2 minutes as of May 2015 @@ -558,18 +727,22 @@ def setUpClass(cls): _skip_if_no_project_id() - test_requirements() - clean_gbq_environment() + _setup_common() + clean_gbq_environment(_get_private_key_path()) - gbq._Dataset(PROJECT_ID).create(DATASET_ID + "1") + gbq._Dataset(_get_project_id(), + private_key=_get_private_key_path() + ).create(DATASET_ID + "1") def setUp(self): # - PER-TEST FIXTURES - # put here any instruction you want to be run *BEFORE* *EVERY* test is # executed. - self.dataset = gbq._Dataset(PROJECT_ID) - self.table = gbq._Table(PROJECT_ID, DATASET_ID + "1") + self.dataset = gbq._Dataset(_get_project_id(), + private_key=_get_private_key_path()) + self.table = gbq._Table(_get_project_id(), DATASET_ID + "1", + private_key=_get_private_key_path()) @classmethod def tearDownClass(cls): @@ -577,7 +750,7 @@ def tearDownClass(cls): # put here any instruction you want to execute only *ONCE* *AFTER* # executing all tests. - clean_gbq_environment() + clean_gbq_environment(_get_private_key_path()) def tearDown(self): # - PER-TEST FIXTURES - @@ -591,13 +764,15 @@ def test_upload_data(self): test_size = 20001 df = make_mixed_dataframe_v2(test_size) - gbq.to_gbq(df, destination_table, PROJECT_ID, chunksize=10000) + gbq.to_gbq(df, destination_table, _get_project_id(), chunksize=10000, + private_key=_get_private_key_path()) sleep(30) # <- Curses Google!!! 
result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM {0}" .format(destination_table), - project_id=PROJECT_ID) + project_id=_get_project_id(), + private_key=_get_private_key_path()) self.assertEqual(result['NUM_ROWS'][0], test_size) def test_upload_data_if_table_exists_fail(self): @@ -609,11 +784,13 @@ def test_upload_data_if_table_exists_fail(self): # Test the default value of if_exists is 'fail' with tm.assertRaises(gbq.TableCreationError): - gbq.to_gbq(df, destination_table, PROJECT_ID) + gbq.to_gbq(df, destination_table, _get_project_id(), + private_key=_get_private_key_path()) # Test the if_exists parameter with value 'fail' with tm.assertRaises(gbq.TableCreationError): - gbq.to_gbq(df, destination_table, PROJECT_ID, if_exists='fail') + gbq.to_gbq(df, destination_table, _get_project_id(), + if_exists='fail', private_key=_get_private_key_path()) def test_upload_data_if_table_exists_append(self): destination_table = DESTINATION_TABLE + "3" @@ -623,22 +800,26 @@ def test_upload_data_if_table_exists_append(self): df_different_schema = tm.makeMixedDataFrame() # Initialize table with sample data - gbq.to_gbq(df, destination_table, PROJECT_ID, chunksize=10000) + gbq.to_gbq(df, destination_table, _get_project_id(), chunksize=10000, + private_key=_get_private_key_path()) # Test the if_exists parameter with value 'append' - gbq.to_gbq(df, destination_table, PROJECT_ID, if_exists='append') + gbq.to_gbq(df, destination_table, _get_project_id(), + if_exists='append', private_key=_get_private_key_path()) sleep(30) # <- Curses Google!!! result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM {0}" .format(destination_table), - project_id=PROJECT_ID) + project_id=_get_project_id(), + private_key=_get_private_key_path()) self.assertEqual(result['NUM_ROWS'][0], test_size * 2) # Try inserting with a different schema, confirm failure with tm.assertRaises(gbq.InvalidSchema): gbq.to_gbq(df_different_schema, destination_table, - PROJECT_ID, if_exists='append') + _get_project_id(), if_exists='append', + private_key=_get_private_key_path()) def test_upload_data_if_table_exists_replace(self): destination_table = DESTINATION_TABLE + "4" @@ -648,17 +829,20 @@ def test_upload_data_if_table_exists_replace(self): df_different_schema = tm.makeMixedDataFrame() # Initialize table with sample data - gbq.to_gbq(df, destination_table, PROJECT_ID, chunksize=10000) + gbq.to_gbq(df, destination_table, _get_project_id(), chunksize=10000, + private_key=_get_private_key_path()) # Test the if_exists parameter with the value 'replace'. gbq.to_gbq(df_different_schema, destination_table, - PROJECT_ID, if_exists='replace') + _get_project_id(), if_exists='replace', + private_key=_get_private_key_path()) sleep(30) # <- Curses Google!!! 
result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM {0}" .format(destination_table), - project_id=PROJECT_ID) + project_id=_get_project_id(), + private_key=_get_private_key_path()) self.assertEqual(result['NUM_ROWS'][0], 5) def test_google_upload_errors_should_raise_exception(self): @@ -671,7 +855,8 @@ def test_google_upload_errors_should_raise_exception(self): index=range(2)) with tm.assertRaises(gbq.StreamingInsertError): - gbq.to_gbq(bad_df, destination_table, PROJECT_ID, verbose=True) + gbq.to_gbq(bad_df, destination_table, _get_project_id(), + verbose=True, private_key=_get_private_key_path()) def test_generate_schema(self): df = tm.makeMixedDataFrame() @@ -730,7 +915,9 @@ def test_list_dataset(self): def test_list_table_zero_results(self): dataset_id = DATASET_ID + "2" self.dataset.create(dataset_id) - table_list = gbq._Dataset(PROJECT_ID).tables(dataset_id) + table_list = gbq._Dataset(_get_project_id(), + private_key=_get_private_key_path() + ).tables(dataset_id) self.assertEqual(len(table_list), 0, 'Expected gbq.list_table() to return 0') @@ -756,7 +943,7 @@ def test_dataset_exists(self): def create_table_data_dataset_does_not_exist(self): dataset_id = DATASET_ID + "6" table_id = TABLE_ID + "1" - table_with_new_dataset = gbq._Table(PROJECT_ID, dataset_id) + table_with_new_dataset = gbq._Table(_get_project_id(), dataset_id) df = make_mixed_dataframe_v2(10) table_with_new_dataset.create(table_id, gbq._generate_bq_schema(df)) self.assertTrue(self.dataset.exists(dataset_id), @@ -786,8 +973,8 @@ def setUpClass(cls): _skip_if_no_project_id() _skip_if_no_private_key_path() - test_requirements() - clean_gbq_environment(PRIVATE_KEY_JSON_PATH) + _setup_common() + clean_gbq_environment(_get_private_key_path()) def setUp(self): # - PER-TEST FIXTURES - @@ -801,7 +988,7 @@ def tearDownClass(cls): # put here any instruction you want to execute only *ONCE* *AFTER* # executing all tests. - clean_gbq_environment(PRIVATE_KEY_JSON_PATH) + clean_gbq_environment(_get_private_key_path()) def tearDown(self): # - PER-TEST FIXTURES - @@ -815,15 +1002,15 @@ def test_upload_data_as_service_account_with_key_path(self): test_size = 10 df = make_mixed_dataframe_v2(test_size) - gbq.to_gbq(df, destination_table, PROJECT_ID, chunksize=10000, - private_key=PRIVATE_KEY_JSON_PATH) + gbq.to_gbq(df, destination_table, _get_project_id(), chunksize=10000, + private_key=_get_private_key_path()) sleep(30) # <- Curses Google!!! result = gbq.read_gbq( "SELECT COUNT(*) as NUM_ROWS FROM {0}".format(destination_table), - project_id=PROJECT_ID, - private_key=PRIVATE_KEY_JSON_PATH) + project_id=_get_project_id(), + private_key=_get_private_key_path()) self.assertEqual(result['NUM_ROWS'][0], test_size) @@ -842,11 +1029,11 @@ def setUpClass(cls): # put here any instruction you want to execute only *ONCE* *BEFORE* # executing *ALL* tests described below. + _setup_common() _skip_if_no_project_id() _skip_if_no_private_key_contents() - test_requirements() - clean_gbq_environment(PRIVATE_KEY_JSON_CONTENTS) + clean_gbq_environment(_get_private_key_contents()) def setUp(self): # - PER-TEST FIXTURES - @@ -860,7 +1047,7 @@ def tearDownClass(cls): # put here any instruction you want to execute only *ONCE* *AFTER* # executing all tests. 
- clean_gbq_environment(PRIVATE_KEY_JSON_CONTENTS) + clean_gbq_environment(_get_private_key_contents()) def tearDown(self): # - PER-TEST FIXTURES - @@ -874,15 +1061,15 @@ def test_upload_data_as_service_account_with_key_contents(self): test_size = 10 df = make_mixed_dataframe_v2(test_size) - gbq.to_gbq(df, destination_table, PROJECT_ID, chunksize=10000, - private_key=PRIVATE_KEY_JSON_CONTENTS) + gbq.to_gbq(df, destination_table, _get_project_id(), chunksize=10000, + private_key=_get_private_key_contents()) sleep(30) # <- Curses Google!!! result = gbq.read_gbq( "SELECT COUNT(*) as NUM_ROWS FROM {0}".format(destination_table), - project_id=PROJECT_ID, - private_key=PRIVATE_KEY_JSON_CONTENTS) + project_id=_get_project_id(), + private_key=_get_private_key_contents()) self.assertEqual(result['NUM_ROWS'][0], test_size) if __name__ == '__main__': diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py index 5a95fe7727df0..7b4e775db9476 100644 --- a/pandas/io/tests/test_html.py +++ b/pandas/io/tests/test_html.py @@ -694,6 +694,72 @@ def test_bool_header_arg(self): with tm.assertRaises(TypeError): read_html(self.spam_data, header=arg) + def test_converters(self): + # GH 13461 + html_data = """ + + + + + + + + + + + + +
+           <tr>
+             <th>a</th>
+           </tr>
+           <tr>
+             <td>0.763</td>
+           </tr>
+           <tr>
+             <td>0.244</td>
+           </tr>
+         </table>"""
+
+        expected_df = DataFrame({'a': ['0.763', '0.244']})
+        html_df = read_html(html_data, converters={'a': str})[0]
+        tm.assert_frame_equal(expected_df, html_df)
+
+    def test_na_values(self):
+        # GH 13461
+        html_data = """<table>
+           <tr>
+             <th>a</th>
+           </tr>
+           <tr>
+             <td>0.763</td>
+           </tr>
+           <tr>
+             <td>0.244</td>
+           </tr>
+         </table>"""
+
+        expected_df = DataFrame({'a': [0.763, np.nan]})
+        html_df = read_html(html_data, na_values=[0.244])[0]
+        tm.assert_frame_equal(expected_df, html_df)
+
+    def test_keep_default_na(self):
+        html_data = """<table>
+           <tr>
+             <th>a</th>
+           </tr>
+           <tr>
+             <td>N/A</td>
+           </tr>
+           <tr>
+             <td>NA</td>
+           </tr>
    """ + + expected_df = DataFrame({'a': ['N/A', 'NA']}) + html_df = read_html(html_data, keep_default_na=False)[0] + tm.assert_frame_equal(expected_df, html_df) + + expected_df = DataFrame({'a': [np.nan, np.nan]}) + html_df = read_html(html_data, keep_default_na=True)[0] + tm.assert_frame_equal(expected_df, html_df) + def _lang_enc(filename): return os.path.splitext(os.path.basename(filename))[0].split('_') diff --git a/pandas/io/tests/test_packers.py b/pandas/io/tests/test_packers.py index ad7d6c3c9f94f..cf61ad9a35935 100644 --- a/pandas/io/tests/test_packers.py +++ b/pandas/io/tests/test_packers.py @@ -1,6 +1,5 @@ import nose -import warnings import os import datetime import numpy as np @@ -8,7 +7,7 @@ from distutils.version import LooseVersion from pandas import compat -from pandas.compat import u +from pandas.compat import u, PY3 from pandas import (Series, DataFrame, Panel, MultiIndex, bdate_range, date_range, period_range, Index, Categorical) from pandas.core.common import PerformanceWarning @@ -58,6 +57,19 @@ def check_arbitrary(a, b): assert_series_equal(a, b) elif isinstance(a, Index): assert_index_equal(a, b) + elif isinstance(a, Categorical): + # Temp, + # Categorical.categories is changed from str to bytes in PY3 + # maybe the same as GH 13591 + if PY3 and b.categories.inferred_type == 'string': + pass + else: + tm.assert_categorical_equal(a, b) + elif a is NaT: + assert b is NaT + elif isinstance(a, Timestamp): + assert a == b + assert a.freq == b.freq else: assert(a == b) @@ -530,25 +542,6 @@ def test_sparse_frame(self): self._check_roundtrip(ss3, tm.assert_frame_equal, check_frame_type=True) - def test_sparse_panel(self): - - with warnings.catch_warnings(record=True): - - items = ['x', 'y', 'z'] - p = Panel(dict((i, tm.makeDataFrame().ix[:2, :2]) for i in items)) - sp = p.to_sparse() - - self._check_roundtrip(sp, tm.assert_panel_equal, - check_panel_type=True) - - sp2 = p.to_sparse(kind='integer') - self._check_roundtrip(sp2, tm.assert_panel_equal, - check_panel_type=True) - - sp3 = p.to_sparse(fill_value=0) - self._check_roundtrip(sp3, tm.assert_panel_equal, - check_panel_type=True) - class TestCompression(TestPackers): """See https://github.com/pydata/pandas/pull/9783 @@ -815,8 +808,8 @@ def check_min_structure(self, data): for typ, v in self.minimum_structure.items(): assert typ in data, '"{0}" not found in unpacked data'.format(typ) for kind in v: - assert kind in data[ - typ], '"{0}" not found in data["{1}"]'.format(kind, typ) + msg = '"{0}" not found in data["{1}"]'.format(kind, typ) + assert kind in data[typ], msg def compare(self, vf, version): # GH12277 encoding default used to be latin-1, now utf-8 @@ -839,8 +832,8 @@ def compare(self, vf, version): # use a specific comparator # if available - comparator = getattr( - self, "compare_{typ}_{dt}".format(typ=typ, dt=dt), None) + comp_method = "compare_{typ}_{dt}".format(typ=typ, dt=dt) + comparator = getattr(self, comp_method, None) if comparator is not None: comparator(result, expected, typ, version) else: @@ -872,9 +865,8 @@ def read_msgpacks(self, version): n = 0 for f in os.listdir(pth): # GH12142 0.17 files packed in P2 can't be read in P3 - if (compat.PY3 and - version.startswith('0.17.') and - f.split('.')[-4][-1] == '2'): + if (compat.PY3 and version.startswith('0.17.') and + f.split('.')[-4][-1] == '2'): continue vf = os.path.join(pth, f) try: diff --git a/pandas/io/tests/test_pickle.py b/pandas/io/tests/test_pickle.py index c12d6e02e3a2e..a49f50b1bcb9f 100644 --- a/pandas/io/tests/test_pickle.py 
+++ b/pandas/io/tests/test_pickle.py @@ -46,6 +46,12 @@ def compare_element(self, result, expected, typ, version=None): if typ.startswith('sp_'): comparator = getattr(tm, "assert_%s_equal" % typ) comparator(result, expected, exact_indices=False) + elif typ == 'timestamp': + if expected is pd.NaT: + assert result is pd.NaT + else: + tm.assert_equal(result, expected) + tm.assert_equal(result.freq, expected.freq) else: comparator = getattr(tm, "assert_%s_equal" % typ, tm.assert_almost_equal) @@ -80,6 +86,14 @@ def compare(self, vf, version): comparator(result, expected, typ, version) return data + def compare_sp_series_ts(self, res, exp, typ, version): + # SparseTimeSeries integrated into SparseSeries in 0.12.0 + # and deprecated in 0.17.0 + if version and LooseVersion(version) <= "0.12.0": + tm.assert_sp_series_equal(res, exp, check_series_type=False) + else: + tm.assert_sp_series_equal(res, exp) + def compare_series_ts(self, result, expected, typ, version): # GH 7748 tm.assert_series_equal(result, expected) @@ -109,8 +123,12 @@ def compare_series_dt_tz(self, result, expected, typ, version): tm.assert_series_equal(result, expected) def compare_series_cat(self, result, expected, typ, version): - # Categorical.ordered is changed in < 0.16.0 - if LooseVersion(version) < '0.16.0': + # Categorical dtype is added in 0.15.0 + # ordered is changed in 0.16.0 + if LooseVersion(version) < '0.15.0': + tm.assert_series_equal(result, expected, check_dtype=False, + check_categorical=False) + elif LooseVersion(version) < '0.16.0': tm.assert_series_equal(result, expected, check_categorical=False) else: tm.assert_series_equal(result, expected) @@ -125,8 +143,12 @@ def compare_frame_dt_mixed_tzs(self, result, expected, typ, version): tm.assert_frame_equal(result, expected) def compare_frame_cat_onecol(self, result, expected, typ, version): - # Categorical.ordered is changed in < 0.16.0 - if LooseVersion(version) < '0.16.0': + # Categorical dtype is added in 0.15.0 + # ordered is changed in 0.16.0 + if LooseVersion(version) < '0.15.0': + tm.assert_frame_equal(result, expected, check_dtype=False, + check_categorical=False) + elif LooseVersion(version) < '0.16.0': tm.assert_frame_equal(result, expected, check_categorical=False) else: tm.assert_frame_equal(result, expected) @@ -141,6 +163,13 @@ def compare_index_period(self, result, expected, typ, version): tm.assert_equal(result.freqstr, 'M') tm.assert_index_equal(result.shift(2), expected.shift(2)) + def compare_sp_frame_float(self, result, expected, typ, version): + if LooseVersion(version) <= '0.18.1': + tm.assert_sp_frame_equal(result, expected, exact_indices=False, + check_dtype=False) + else: + tm.assert_sp_frame_equal(result, expected) + def read_pickles(self, version): if not is_platform_little_endian(): raise nose.SkipTest("known failure on non-little endian") @@ -217,6 +246,44 @@ def python_unpickler(path): result = python_unpickler(path) self.compare_element(result, expected, typ) + def test_pickle_v0_14_1(self): + + # we have the name warning + # 10482 + with tm.assert_produces_warning(UserWarning): + cat = pd.Categorical(values=['a', 'b', 'c'], + categories=['a', 'b', 'c', 'd'], + name='foobar', ordered=False) + pickle_path = os.path.join(tm.get_data_path(), + 'categorical_0_14_1.pickle') + # This code was executed once on v0.14.1 to generate the pickle: + # + # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'], + # name='foobar') + # with open(pickle_path, 'wb') as f: pickle.dump(cat, f) + # + tm.assert_categorical_equal(cat, 
pd.read_pickle(pickle_path)) + + def test_pickle_v0_15_2(self): + # ordered -> _ordered + # GH 9347 + + # we have the name warning + # 10482 + with tm.assert_produces_warning(UserWarning): + cat = pd.Categorical(values=['a', 'b', 'c'], + categories=['a', 'b', 'c', 'd'], + name='foobar', ordered=False) + pickle_path = os.path.join(tm.get_data_path(), + 'categorical_0_15_2.pickle') + # This code was executed once on v0.15.2 to generate the pickle: + # + # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'], + # name='foobar') + # with open(pickle_path, 'wb') as f: pickle.dump(cat, f) + # + tm.assert_categorical_equal(cat, pd.read_pickle(pickle_path)) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 96b66265ea586..44ff9f8a5a1dd 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -34,7 +34,8 @@ assert_panel_equal, assert_frame_equal, assert_series_equal, - assert_produces_warning) + assert_produces_warning, + set_timezone) from pandas import concat, Timestamp from pandas import compat from pandas.compat import range, lrange, u @@ -46,8 +47,8 @@ from distutils.version import LooseVersion -_default_compressor = LooseVersion(tables.__version__) >= '2.2' \ - and 'blosc' or 'zlib' +_default_compressor = ('blosc' if LooseVersion(tables.__version__) >= '2.2' + else 'zlib') _multiprocess_can_split_ = False @@ -132,13 +133,15 @@ def _maybe_remove(store, key): pass -def compat_assert_produces_warning(w, f): +@contextmanager +def compat_assert_produces_warning(w): """ don't produce a warning under PY3 """ if compat.PY3: - f() + yield else: - with tm.assert_produces_warning(expected_warning=w): - f() + with tm.assert_produces_warning(expected_warning=w, + check_stacklevel=False): + yield class Base(tm.TestCase): @@ -336,7 +339,8 @@ def test_api(self): # File path doesn't exist path = "" - self.assertRaises(IOError, read_hdf, path, 'df') + self.assertRaises(compat.FileNotFoundError, + read_hdf, path, 'df') def test_api_default_format(self): @@ -528,16 +532,25 @@ def f(): # conv read if mode in ['w']: - self.assertRaises(KeyError, read_hdf, + self.assertRaises(ValueError, read_hdf, path, 'df', mode=mode) else: result = read_hdf(path, 'df', mode=mode) assert_frame_equal(result, df) + def check_default_mode(): + + # read_hdf uses default mode + with ensure_clean_path(self.path) as path: + df.to_hdf(path, 'df', mode='w') + result = read_hdf(path, 'df') + assert_frame_equal(result, df) + check('r') check('r+') check('a') check('w') + check_default_mode() def test_reopen_handle(self): @@ -807,28 +820,30 @@ def test_append(self): assert_panel_equal(store['wp1'], wp) # ndim - p4d = tm.makePanel4D() - _maybe_remove(store, 'p4d') - store.append('p4d', p4d.ix[:, :, :10, :]) - store.append('p4d', p4d.ix[:, :, 10:, :]) - assert_panel4d_equal(store['p4d'], p4d) - - # test using axis labels - _maybe_remove(store, 'p4d') - store.append('p4d', p4d.ix[:, :, :10, :], axes=[ - 'items', 'major_axis', 'minor_axis']) - store.append('p4d', p4d.ix[:, :, 10:, :], axes=[ - 'items', 'major_axis', 'minor_axis']) - assert_panel4d_equal(store['p4d'], p4d) - - # test using differnt number of items on each axis - p4d2 = p4d.copy() - p4d2['l4'] = p4d['l1'] - p4d2['l5'] = p4d['l1'] - _maybe_remove(store, 'p4d2') - store.append( - 'p4d2', p4d2, axes=['items', 'major_axis', 'minor_axis']) - assert_panel4d_equal(store['p4d2'], p4d2) + with 
tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + p4d = tm.makePanel4D() + _maybe_remove(store, 'p4d') + store.append('p4d', p4d.ix[:, :, :10, :]) + store.append('p4d', p4d.ix[:, :, 10:, :]) + assert_panel4d_equal(store['p4d'], p4d) + + # test using axis labels + _maybe_remove(store, 'p4d') + store.append('p4d', p4d.ix[:, :, :10, :], axes=[ + 'items', 'major_axis', 'minor_axis']) + store.append('p4d', p4d.ix[:, :, 10:, :], axes=[ + 'items', 'major_axis', 'minor_axis']) + assert_panel4d_equal(store['p4d'], p4d) + + # test using differnt number of items on each axis + p4d2 = p4d.copy() + p4d2['l4'] = p4d['l1'] + p4d2['l5'] = p4d['l1'] + _maybe_remove(store, 'p4d2') + store.append( + 'p4d2', p4d2, axes=['items', 'major_axis', 'minor_axis']) + assert_panel4d_equal(store['p4d2'], p4d2) # test using differt order of items on the non-index axes _maybe_remove(store, 'wp1') @@ -1219,72 +1234,74 @@ def test_append_with_different_block_ordering(self): self.assertRaises(ValueError, store.append, 'df', df) def test_ndim_indexables(self): - """ test using ndim tables in new ways""" - - with ensure_clean_store(self.path) as store: + # test using ndim tables in new ways - p4d = tm.makePanel4D() + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with ensure_clean_store(self.path) as store: - def check_indexers(key, indexers): - for i, idx in enumerate(indexers): - self.assertTrue(getattr(getattr( - store.root, key).table.description, idx)._v_pos == i) - - # append then change (will take existing schema) - indexers = ['items', 'major_axis', 'minor_axis'] - - _maybe_remove(store, 'p4d') - store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers) - store.append('p4d', p4d.ix[:, :, 10:, :]) - assert_panel4d_equal(store.select('p4d'), p4d) - check_indexers('p4d', indexers) - - # same as above, but try to append with differnt axes - _maybe_remove(store, 'p4d') - store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers) - store.append('p4d', p4d.ix[:, :, 10:, :], axes=[ - 'labels', 'items', 'major_axis']) - assert_panel4d_equal(store.select('p4d'), p4d) - check_indexers('p4d', indexers) - - # pass incorrect number of axes - _maybe_remove(store, 'p4d') - self.assertRaises(ValueError, store.append, 'p4d', p4d.ix[ - :, :, :10, :], axes=['major_axis', 'minor_axis']) - - # different than default indexables #1 - indexers = ['labels', 'major_axis', 'minor_axis'] - _maybe_remove(store, 'p4d') - store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers) - store.append('p4d', p4d.ix[:, :, 10:, :]) - assert_panel4d_equal(store['p4d'], p4d) - check_indexers('p4d', indexers) - - # different than default indexables #2 - indexers = ['major_axis', 'labels', 'minor_axis'] - _maybe_remove(store, 'p4d') - store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers) - store.append('p4d', p4d.ix[:, :, 10:, :]) - assert_panel4d_equal(store['p4d'], p4d) - check_indexers('p4d', indexers) - - # partial selection - result = store.select('p4d', ['labels=l1']) - expected = p4d.reindex(labels=['l1']) - assert_panel4d_equal(result, expected) - - # partial selection2 - result = store.select('p4d', [Term( - 'labels=l1'), Term('items=ItemA'), Term('minor_axis=B')]) - expected = p4d.reindex( - labels=['l1'], items=['ItemA'], minor_axis=['B']) - assert_panel4d_equal(result, expected) - - # non-existant partial selection - result = store.select('p4d', [Term( - 'labels=l1'), Term('items=Item1'), Term('minor_axis=B')]) - expected = p4d.reindex(labels=['l1'], items=[], minor_axis=['B']) - 
assert_panel4d_equal(result, expected) + p4d = tm.makePanel4D() + + def check_indexers(key, indexers): + for i, idx in enumerate(indexers): + descr = getattr(store.root, key).table.description + self.assertTrue(getattr(descr, idx)._v_pos == i) + + # append then change (will take existing schema) + indexers = ['items', 'major_axis', 'minor_axis'] + + _maybe_remove(store, 'p4d') + store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers) + store.append('p4d', p4d.ix[:, :, 10:, :]) + assert_panel4d_equal(store.select('p4d'), p4d) + check_indexers('p4d', indexers) + + # same as above, but try to append with differnt axes + _maybe_remove(store, 'p4d') + store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers) + store.append('p4d', p4d.ix[:, :, 10:, :], axes=[ + 'labels', 'items', 'major_axis']) + assert_panel4d_equal(store.select('p4d'), p4d) + check_indexers('p4d', indexers) + + # pass incorrect number of axes + _maybe_remove(store, 'p4d') + self.assertRaises(ValueError, store.append, 'p4d', p4d.ix[ + :, :, :10, :], axes=['major_axis', 'minor_axis']) + + # different than default indexables #1 + indexers = ['labels', 'major_axis', 'minor_axis'] + _maybe_remove(store, 'p4d') + store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers) + store.append('p4d', p4d.ix[:, :, 10:, :]) + assert_panel4d_equal(store['p4d'], p4d) + check_indexers('p4d', indexers) + + # different than default indexables #2 + indexers = ['major_axis', 'labels', 'minor_axis'] + _maybe_remove(store, 'p4d') + store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers) + store.append('p4d', p4d.ix[:, :, 10:, :]) + assert_panel4d_equal(store['p4d'], p4d) + check_indexers('p4d', indexers) + + # partial selection + result = store.select('p4d', ['labels=l1']) + expected = p4d.reindex(labels=['l1']) + assert_panel4d_equal(result, expected) + + # partial selection2 + result = store.select('p4d', [Term( + 'labels=l1'), Term('items=ItemA'), Term('minor_axis=B')]) + expected = p4d.reindex( + labels=['l1'], items=['ItemA'], minor_axis=['B']) + assert_panel4d_equal(result, expected) + + # non-existant partial selection + result = store.select('p4d', [Term( + 'labels=l1'), Term('items=Item1'), Term('minor_axis=B')]) + expected = p4d.reindex(labels=['l1'], items=[], + minor_axis=['B']) + assert_panel4d_equal(result, expected) def test_append_with_strings(self): @@ -1815,24 +1832,27 @@ def test_append_misc(self): with ensure_clean_store(self.path) as store: - # unsuported data types for non-tables - p4d = tm.makePanel4D() - self.assertRaises(TypeError, store.put, 'p4d', p4d) + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): - # unsuported data types - self.assertRaises(TypeError, store.put, 'abc', None) - self.assertRaises(TypeError, store.put, 'abc', '123') - self.assertRaises(TypeError, store.put, 'abc', 123) - self.assertRaises(TypeError, store.put, 'abc', np.arange(5)) + # unsuported data types for non-tables + p4d = tm.makePanel4D() + self.assertRaises(TypeError, store.put, 'p4d', p4d) - df = tm.makeDataFrame() - store.append('df', df, chunksize=1) - result = store.select('df') - tm.assert_frame_equal(result, df) + # unsuported data types + self.assertRaises(TypeError, store.put, 'abc', None) + self.assertRaises(TypeError, store.put, 'abc', '123') + self.assertRaises(TypeError, store.put, 'abc', 123) + self.assertRaises(TypeError, store.put, 'abc', np.arange(5)) - store.append('df1', df, expectedrows=10) - result = store.select('df1') - tm.assert_frame_equal(result, df) + df = tm.makeDataFrame() + store.append('df', 
df, chunksize=1) + result = store.select('df') + tm.assert_frame_equal(result, df) + + store.append('df1', df, expectedrows=10) + result = store.select('df1') + tm.assert_frame_equal(result, df) # more chunksize in append tests def check(obj, comparator): @@ -1854,8 +1874,9 @@ def check(obj, comparator): p = tm.makePanel() check(p, assert_panel_equal) - p4d = tm.makePanel4D() - check(p4d, assert_panel4d_equal) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + p4d = tm.makePanel4D() + check(p4d, assert_panel4d_equal) # empty frame, GH4273 with ensure_clean_store(self.path) as store: @@ -2021,19 +2042,20 @@ def test_table_mixed_dtypes(self): store.append('p1_mixed', wp) assert_panel_equal(store.select('p1_mixed'), wp) - # ndim - wp = tm.makePanel4D() - wp['obj1'] = 'foo' - wp['obj2'] = 'bar' - wp['bool1'] = wp['l1'] > 0 - wp['bool2'] = wp['l2'] > 0 - wp['int1'] = 1 - wp['int2'] = 2 - wp = wp.consolidate() + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + # ndim + wp = tm.makePanel4D() + wp['obj1'] = 'foo' + wp['obj2'] = 'bar' + wp['bool1'] = wp['l1'] > 0 + wp['bool2'] = wp['l2'] > 0 + wp['int1'] = 1 + wp['int2'] = 2 + wp = wp.consolidate() - with ensure_clean_store(self.path) as store: - store.append('p4d_mixed', wp) - assert_panel4d_equal(store.select('p4d_mixed'), wp) + with ensure_clean_store(self.path) as store: + store.append('p4d_mixed', wp) + assert_panel4d_equal(store.select('p4d_mixed'), wp) def test_unimplemented_dtypes_table_columns(self): @@ -2354,29 +2376,34 @@ def test_invalid_terms(self): with ensure_clean_store(self.path) as store: - df = tm.makeTimeDataFrame() - df['string'] = 'foo' - df.ix[0:4, 'string'] = 'bar' - wp = tm.makePanel() - p4d = tm.makePanel4D() - store.put('df', df, format='table') - store.put('wp', wp, format='table') - store.put('p4d', p4d, format='table') - - # some invalid terms - self.assertRaises(ValueError, store.select, - 'wp', "minor=['A', 'B']") - self.assertRaises(ValueError, store.select, - 'wp', ["index=['20121114']"]) - self.assertRaises(ValueError, store.select, 'wp', [ - "index=['20121114', '20121114']"]) - self.assertRaises(TypeError, Term) - - # more invalid - self.assertRaises(ValueError, store.select, 'df', 'df.index[3]') - self.assertRaises(SyntaxError, store.select, 'df', 'index>') - self.assertRaises(ValueError, store.select, 'wp', - "major_axis<'20000108' & minor_axis['A', 'B']") + with compat_assert_produces_warning(FutureWarning): + + df = tm.makeTimeDataFrame() + df['string'] = 'foo' + df.ix[0:4, 'string'] = 'bar' + wp = tm.makePanel() + + p4d = tm.makePanel4D() + store.put('df', df, format='table') + store.put('wp', wp, format='table') + store.put('p4d', p4d, format='table') + + # some invalid terms + self.assertRaises(ValueError, store.select, + 'wp', "minor=['A', 'B']") + self.assertRaises(ValueError, store.select, + 'wp', ["index=['20121114']"]) + self.assertRaises(ValueError, store.select, 'wp', [ + "index=['20121114', '20121114']"]) + self.assertRaises(TypeError, Term) + + # more invalid + self.assertRaises( + ValueError, store.select, 'df', 'df.index[3]') + self.assertRaises(SyntaxError, store.select, 'df', 'index>') + self.assertRaises( + ValueError, store.select, 'wp', + "major_axis<'20000108' & minor_axis['A', 'B']") # from the docs with ensure_clean_path(self.path) as path: @@ -2403,12 +2430,16 @@ def test_terms(self): with ensure_clean_store(self.path) as store: wp = tm.makePanel() - p4d = tm.makePanel4D() wpneg = Panel.fromDict({-1: tm.makeDataFrame(), 0: 
tm.makeDataFrame(), 1: tm.makeDataFrame()}) + + with compat_assert_produces_warning(FutureWarning): + + p4d = tm.makePanel4D() + store.put('p4d', p4d, format='table') + store.put('wp', wp, format='table') - store.put('p4d', p4d, format='table') store.put('wpneg', wpneg, format='table') # panel @@ -2424,12 +2455,15 @@ def test_terms(self): tm.assert_panel_equal(result, expected) # p4d - result = store.select('p4d', [Term('major_axis<"20000108"'), - Term("minor_axis=['A', 'B']"), - Term("items=['ItemA', 'ItemB']")]) - expected = p4d.truncate(after='20000108').reindex( - minor=['A', 'B'], items=['ItemA', 'ItemB']) - assert_panel4d_equal(result, expected) + with compat_assert_produces_warning(FutureWarning): + + result = store.select('p4d', + [Term('major_axis<"20000108"'), + Term("minor_axis=['A', 'B']"), + Term("items=['ItemA', 'ItemB']")]) + expected = p4d.truncate(after='20000108').reindex( + minor=['A', 'B'], items=['ItemA', 'ItemB']) + assert_panel4d_equal(result, expected) # back compat invalid terms terms = [dict(field='major_axis', op='>', value='20121114'), @@ -2441,34 +2475,34 @@ def test_terms(self): check_stacklevel=False): Term(t) - # valid terms - terms = [ - ('major_axis=20121114'), - ('major_axis>20121114'), - (("major_axis=['20121114', '20121114']"),), - ('major_axis=datetime.datetime(2012, 11, 14)'), - 'major_axis> 20121114', - 'major_axis >20121114', - 'major_axis > 20121114', - (("minor_axis=['A', 'B']"),), - (("minor_axis=['A', 'B']"),), - ((("minor_axis==['A', 'B']"),),), - (("items=['ItemA', 'ItemB']"),), - ('items=ItemA'), - ] - - for t in terms: - store.select('wp', t) - store.select('p4d', t) - - # valid for p4d only - terms = [ - (("labels=['l1', 'l2']"),), - Term("labels=['l1', 'l2']"), - ] - - for t in terms: - store.select('p4d', t) + with compat_assert_produces_warning(FutureWarning): + + # valid terms + terms = [('major_axis=20121114'), + ('major_axis>20121114'), + (("major_axis=['20121114', '20121114']"),), + ('major_axis=datetime.datetime(2012, 11, 14)'), + 'major_axis> 20121114', + 'major_axis >20121114', + 'major_axis > 20121114', + (("minor_axis=['A', 'B']"),), + (("minor_axis=['A', 'B']"),), + ((("minor_axis==['A', 'B']"),),), + (("items=['ItemA', 'ItemB']"),), + ('items=ItemA'), + ] + + for t in terms: + store.select('wp', t) + store.select('p4d', t) + + # valid for p4d only + terms = [(("labels=['l1', 'l2']"),), + Term("labels=['l1', 'l2']"), + ] + + for t in terms: + store.select('p4d', t) with tm.assertRaisesRegexp(TypeError, 'Only named functions are supported'): @@ -2664,23 +2698,6 @@ def test_sparse_frame(self): self._check_double_roundtrip(ss3, tm.assert_frame_equal, check_frame_type=True) - def test_sparse_panel(self): - - items = ['x', 'y', 'z'] - p = Panel(dict((i, tm.makeDataFrame().ix[:2, :2]) for i in items)) - sp = p.to_sparse() - - self._check_double_roundtrip(sp, assert_panel_equal, - check_panel_type=True) - - sp2 = p.to_sparse(kind='integer') - self._check_double_roundtrip(sp2, assert_panel_equal, - check_panel_type=True) - - sp3 = p.to_sparse(fill_value=0) - self._check_double_roundtrip(sp3, assert_panel_equal, - check_panel_type=True) - def test_float_index(self): # GH #454 @@ -2835,7 +2852,7 @@ def test_store_hierarchical(self): with ensure_clean_store(self.path) as store: store['frame'] = frame recons = store['frame'] - assert(recons.index.names == ('foo', 'bar')) + tm.assert_frame_equal(recons, frame) def test_store_index_name(self): df = tm.makeDataFrame() @@ -2844,7 +2861,19 @@ def test_store_index_name(self): with 
ensure_clean_store(self.path) as store: store['frame'] = df recons = store['frame'] - assert(recons.index.name == 'foo') + tm.assert_frame_equal(recons, df) + + def test_store_index_name_with_tz(self): + # GH 13884 + df = pd.DataFrame({'A': [1, 2]}) + df.index = pd.DatetimeIndex([1234567890123456787, 1234567890123456788]) + df.index = df.index.tz_localize('UTC') + df.index.name = 'foo' + + with ensure_clean_store(self.path) as store: + store.put('frame', df, format='table') + recons = store['frame'] + tm.assert_frame_equal(recons, df) def test_store_series_name(self): df = tm.makeDataFrame() @@ -2853,7 +2882,7 @@ def test_store_series_name(self): with ensure_clean_store(self.path) as store: store['series'] = series recons = store['series'] - assert(recons.name == 'A') + tm.assert_series_equal(recons, series) def test_store_mixed(self): @@ -4404,12 +4433,12 @@ def test_legacy_table_read(self): def test_legacy_0_10_read(self): # legacy from 0.10 - with ensure_clean_store( - tm.get_data_path('legacy_hdf/legacy_0.10.h5'), - mode='r') as store: - str(store) - for k in store.keys(): - store.select(k) + with compat_assert_produces_warning(FutureWarning): + path = tm.get_data_path('legacy_hdf/legacy_0.10.h5') + with ensure_clean_store(path, mode='r') as store: + str(store) + for k in store.keys(): + store.select(k) def test_legacy_0_11_read(self): # legacy from 0.11 @@ -4428,65 +4457,69 @@ def test_legacy_0_11_read(self): def test_copy(self): - def do_copy(f=None, new_f=None, keys=None, propindexes=True, **kwargs): - try: - if f is None: - f = tm.get_data_path(os.path.join('legacy_hdf', - 'legacy_0.10.h5')) - - store = HDFStore(f, 'r') - - if new_f is None: - import tempfile - fd, new_f = tempfile.mkstemp() - - tstore = store.copy( - new_f, keys=keys, propindexes=propindexes, **kwargs) - - # check keys - if keys is None: - keys = store.keys() - self.assertEqual(set(keys), set(tstore.keys())) + with compat_assert_produces_warning(FutureWarning): - # check indicies & nrows - for k in tstore.keys(): - if tstore.get_storer(k).is_table: - new_t = tstore.get_storer(k) - orig_t = store.get_storer(k) - - self.assertEqual(orig_t.nrows, new_t.nrows) - - # check propindixes - if propindexes: - for a in orig_t.axes: - if a.is_indexed: - self.assertTrue(new_t[a.name].is_indexed) - - finally: - safe_close(store) - safe_close(tstore) + def do_copy(f=None, new_f=None, keys=None, + propindexes=True, **kwargs): try: - os.close(fd) - except: - pass - safe_remove(new_f) - - do_copy() - do_copy(keys=['/a', '/b', '/df1_mixed']) - do_copy(propindexes=False) - - # new table - df = tm.makeDataFrame() + if f is None: + f = tm.get_data_path(os.path.join('legacy_hdf', + 'legacy_0.10.h5')) + + store = HDFStore(f, 'r') + + if new_f is None: + import tempfile + fd, new_f = tempfile.mkstemp() + + tstore = store.copy( + new_f, keys=keys, propindexes=propindexes, **kwargs) + + # check keys + if keys is None: + keys = store.keys() + self.assertEqual(set(keys), set(tstore.keys())) + + # check indicies & nrows + for k in tstore.keys(): + if tstore.get_storer(k).is_table: + new_t = tstore.get_storer(k) + orig_t = store.get_storer(k) + + self.assertEqual(orig_t.nrows, new_t.nrows) + + # check propindixes + if propindexes: + for a in orig_t.axes: + if a.is_indexed: + self.assertTrue( + new_t[a.name].is_indexed) + + finally: + safe_close(store) + safe_close(tstore) + try: + os.close(fd) + except: + pass + safe_remove(new_f) + + do_copy() + do_copy(keys=['/a', '/b', '/df1_mixed']) + do_copy(propindexes=False) + + # new table + df = 
tm.makeDataFrame() - try: - path = create_tempfile(self.path) - st = HDFStore(path) - st.append('df', df, data_columns=['A']) - st.close() - do_copy(f=path) - do_copy(f=path, propindexes=False) - finally: - safe_remove(path) + try: + path = create_tempfile(self.path) + st = HDFStore(path) + st.append('df', df, data_columns=['A']) + st.close() + do_copy(f=path) + do_copy(f=path, propindexes=False) + finally: + safe_remove(path) def test_legacy_table_write(self): raise nose.SkipTest("cannot write legacy tables") @@ -4566,12 +4599,10 @@ def test_unicode_index(self): unicode_values = [u('\u03c3'), u('\u03c3\u03c3')] - def f(): + with compat_assert_produces_warning(PerformanceWarning): s = Series(np.random.randn(len(unicode_values)), unicode_values) self._check_roundtrip(s, tm.assert_series_equal) - compat_assert_produces_warning(PerformanceWarning, f) - def test_unicode_longer_encoded(self): # GH 11234 char = '\u0394' @@ -4724,6 +4755,36 @@ def test_categorical(self): self.assertRaises( KeyError, lambda: store.select('df3/meta/s/meta')) + def test_categorical_conversion(self): + + # GH13322 + # Check that read_hdf with categorical columns doesn't return rows if + # where criteria isn't met. + obsids = ['ESP_012345_6789', 'ESP_987654_3210'] + imgids = ['APF00006np', 'APF0001imm'] + data = [4.3, 9.8] + + # Test without categories + df = DataFrame(dict(obsids=obsids, imgids=imgids, data=data)) + + # We are expecting an empty DataFrame matching types of df + expected = df.iloc[[], :] + with ensure_clean_path(self.path) as path: + df.to_hdf(path, 'df', format='table', data_columns=True) + result = read_hdf(path, 'df', where='obsids=B') + tm.assert_frame_equal(result, expected) + + # Test with categories + df.obsids = df.obsids.astype('category') + df.imgids = df.imgids.astype('category') + + # We are expecting an empty DataFrame matching types of df + expected = df.iloc[[], :] + with ensure_clean_path(self.path) as path: + df.to_hdf(path, 'df', format='table', data_columns=True) + result = read_hdf(path, 'df', where='obsids=B') + tm.assert_frame_equal(result, expected) + def test_duplicate_column_name(self): df = DataFrame(columns=["a", "a"], data=[[0, 0]]) @@ -4877,6 +4938,9 @@ def test_read_nokey(self): df = DataFrame(np.random.rand(4, 5), index=list('abcd'), columns=list('ABCDE')) + + # Categorical dtype not supported for "fixed" format. So no need + # to test with that dtype in the dataframe here. 
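
An aside on the HDF changes above: the new test_categorical_conversion (GH 13322) checks that a where selection on a categorical data column returns an empty frame with matching dtypes rather than unmatched rows. A minimal standalone sketch of that usage, assuming PyTables is installed; the file name is illustrative:

    import pandas as pd

    df = pd.DataFrame({'obsids': ['ESP_012345_6789', 'ESP_987654_3210'],
                       'data': [4.3, 9.8]})
    df['obsids'] = df['obsids'].astype('category')

    # 'example.h5' is an illustrative path; data_columns=True makes the string
    # and categorical columns queryable in the table format.
    df.to_hdf('example.h5', 'df', format='table', data_columns=True)

    # A where clause that matches nothing returns an empty frame with the same
    # dtypes as df, rather than spurious rows.
    subset = pd.read_hdf('example.h5', 'df', where='obsids=B')
    assert subset.empty
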
with ensure_clean_path(self.path) as path: df.to_hdf(path, 'df', mode='a') reread = read_hdf(path) @@ -4884,6 +4948,24 @@ def test_read_nokey(self): df.to_hdf(path, 'df2', mode='a') self.assertRaises(ValueError, read_hdf, path) + def test_read_nokey_table(self): + # GH13231 + df = DataFrame({'i': range(5), + 'c': Series(list('abacd'), dtype='category')}) + + with ensure_clean_path(self.path) as path: + df.to_hdf(path, 'df', mode='a', format='table') + reread = read_hdf(path) + assert_frame_equal(df, reread) + df.to_hdf(path, 'df2', mode='a', format='table') + self.assertRaises(ValueError, read_hdf, path) + + def test_read_nokey_empty(self): + with ensure_clean_path(self.path) as path: + store = HDFStore(path) + store.close() + self.assertRaises(ValueError, read_hdf, path) + def test_read_from_pathlib_path(self): # GH11773 @@ -5019,16 +5101,18 @@ def test_complex_across_dimensions(self): s = Series(complex128, index=list('abcd')) df = DataFrame({'A': s, 'B': s}) p = Panel({'One': df, 'Two': df}) - p4d = pd.Panel4D({'i': p, 'ii': p}) - objs = [df, p, p4d] - comps = [tm.assert_frame_equal, tm.assert_panel_equal, - tm.assert_panel4d_equal] - for obj, comp in zip(objs, comps): - with ensure_clean_path(self.path) as path: - obj.to_hdf(path, 'obj', format='table') - reread = read_hdf(path, 'obj') - comp(obj, reread) + with compat_assert_produces_warning(FutureWarning): + p4d = pd.Panel4D({'i': p, 'ii': p}) + + objs = [df, p, p4d] + comps = [tm.assert_frame_equal, tm.assert_panel_equal, + tm.assert_panel4d_equal] + for obj, comp in zip(objs, comps): + with ensure_clean_path(self.path) as path: + obj.to_hdf(path, 'obj', format='table') + reread = read_hdf(path, 'obj') + comp(obj, reread) def test_complex_indexing_error(self): complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], @@ -5288,14 +5372,6 @@ def test_store_timezone(self): # issue storing datetime.date with a timezone as it resets when read # back in a new timezone - import platform - if platform.system() == "Windows": - raise nose.SkipTest("timezone setting not supported on windows") - - import datetime - import time - import os - # original method with ensure_clean_store(self.path) as store: @@ -5306,34 +5382,17 @@ def test_store_timezone(self): assert_frame_equal(result, df) # with tz setting - orig_tz = os.environ.get('TZ') - - def setTZ(tz): - if tz is None: - try: - del os.environ['TZ'] - except: - pass - else: - os.environ['TZ'] = tz - time.tzset() - - try: - - with ensure_clean_store(self.path) as store: + with ensure_clean_store(self.path) as store: - setTZ('EST5EDT') + with set_timezone('EST5EDT'): today = datetime.date(2013, 9, 10) df = DataFrame([1, 2, 3], index=[today, today, today]) store['obj1'] = df - setTZ('CST6CDT') + with set_timezone('CST6CDT'): result = store['obj1'] - assert_frame_equal(result, df) - - finally: - setTZ(orig_tz) + assert_frame_equal(result, df) def test_legacy_datetimetz_object(self): # legacy from < 0.17.0 diff --git a/pandas/io/tests/test_sql.py b/pandas/io/tests/test_sql.py index 9a995c17f0445..198a4017b5af7 100644 --- a/pandas/io/tests/test_sql.py +++ b/pandas/io/tests/test_sql.py @@ -13,7 +13,7 @@ common methods, `_TestSQLAlchemyConn` tests the API with a SQLAlchemy Connection object. 
The different tested flavors (sqlite3, MySQL, PostgreSQL) derive from the base class - - Tests for the fallback mode (`TestSQLiteFallback` and `TestMySQLLegacy`) + - Tests for the fallback mode (`TestSQLiteFallback`) """ @@ -31,12 +31,13 @@ from datetime import datetime, date, time +from pandas.types.common import (is_object_dtype, is_datetime64_dtype, + is_datetime64tz_dtype) from pandas import DataFrame, Series, Index, MultiIndex, isnull, concat from pandas import date_range, to_datetime, to_timedelta, Timestamp import pandas.compat as compat from pandas.compat import StringIO, range, lrange, string_types -from pandas.core import common as com -from pandas.core.datetools import format as date_format +from pandas.tseries.tools import format as date_format import pandas.io.sql as sql from pandas.io.sql import read_sql_table, read_sql_query @@ -525,30 +526,29 @@ def test_read_sql_view(self): self._check_iris_loaded_frame(iris_frame) def test_to_sql(self): - sql.to_sql(self.test_frame1, 'test_frame1', self.conn, flavor='sqlite') + sql.to_sql(self.test_frame1, 'test_frame1', self.conn) self.assertTrue( - sql.has_table('test_frame1', self.conn, flavor='sqlite'), + sql.has_table('test_frame1', self.conn), 'Table not written to DB') def test_to_sql_fail(self): sql.to_sql(self.test_frame1, 'test_frame2', - self.conn, flavor='sqlite', if_exists='fail') + self.conn, if_exists='fail') self.assertTrue( - sql.has_table('test_frame2', self.conn, flavor='sqlite'), + sql.has_table('test_frame2', self.conn), 'Table not written to DB') self.assertRaises(ValueError, sql.to_sql, self.test_frame1, - 'test_frame2', self.conn, flavor='sqlite', - if_exists='fail') + 'test_frame2', self.conn, if_exists='fail') def test_to_sql_replace(self): sql.to_sql(self.test_frame1, 'test_frame3', - self.conn, flavor='sqlite', if_exists='fail') + self.conn, if_exists='fail') # Add to table again sql.to_sql(self.test_frame1, 'test_frame3', - self.conn, flavor='sqlite', if_exists='replace') + self.conn, if_exists='replace') self.assertTrue( - sql.has_table('test_frame3', self.conn, flavor='sqlite'), + sql.has_table('test_frame3', self.conn), 'Table not written to DB') num_entries = len(self.test_frame1) @@ -559,13 +559,13 @@ def test_to_sql_replace(self): def test_to_sql_append(self): sql.to_sql(self.test_frame1, 'test_frame4', - self.conn, flavor='sqlite', if_exists='fail') + self.conn, if_exists='fail') # Add to table again sql.to_sql(self.test_frame1, 'test_frame4', - self.conn, flavor='sqlite', if_exists='append') + self.conn, if_exists='append') self.assertTrue( - sql.has_table('test_frame4', self.conn, flavor='sqlite'), + sql.has_table('test_frame4', self.conn), 'Table not written to DB') num_entries = 2 * len(self.test_frame1) @@ -575,26 +575,25 @@ def test_to_sql_append(self): num_rows, num_entries, "not the same number of rows as entries") def test_to_sql_type_mapping(self): - sql.to_sql(self.test_frame3, 'test_frame5', - self.conn, flavor='sqlite', index=False) + sql.to_sql(self.test_frame3, 'test_frame5', self.conn, index=False) result = sql.read_sql("SELECT * FROM test_frame5", self.conn) tm.assert_frame_equal(self.test_frame3, result) def test_to_sql_series(self): s = Series(np.arange(5, dtype='int64'), name='series') - sql.to_sql(s, "test_series", self.conn, flavor='sqlite', index=False) + sql.to_sql(s, "test_series", self.conn, index=False) s2 = sql.read_sql_query("SELECT * FROM test_series", self.conn) tm.assert_frame_equal(s.to_frame(), s2) def test_to_sql_panel(self): panel = tm.makePanel() 
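
The test_sql.py hunks above and below drop the deprecated flavor argument throughout: to_sql and read_sql_query are now called with just the frame, table name, and connection. A small sketch of the updated calling convention, assuming an in-memory sqlite3 connection; table and column names are illustrative:

    import sqlite3
    import pandas as pd
    from pandas.io import sql

    conn = sqlite3.connect(':memory:')
    df = pd.DataFrame({'A': [1, 2], 'B': [3.0, 4.0]})

    # No flavor= argument any more; a plain DBAPI connection uses the sqlite
    # fallback, and other flavors require SQLAlchemy (gh-13611).
    sql.to_sql(df, 'example', conn, index=False)
    roundtrip = sql.read_sql_query('SELECT * FROM example', conn)
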
self.assertRaises(NotImplementedError, sql.to_sql, panel, - 'test_panel', self.conn, flavor='sqlite') + 'test_panel', self.conn) def test_roundtrip(self): sql.to_sql(self.test_frame1, 'test_frame_roundtrip', - con=self.conn, flavor='sqlite') + con=self.conn) result = sql.read_sql_query( 'SELECT * FROM test_frame_roundtrip', con=self.conn) @@ -608,7 +607,7 @@ def test_roundtrip(self): def test_roundtrip_chunksize(self): sql.to_sql(self.test_frame1, 'test_frame_roundtrip', con=self.conn, - index=False, flavor='sqlite', chunksize=2) + index=False, chunksize=2) result = sql.read_sql_query( 'SELECT * FROM test_frame_roundtrip', con=self.conn) @@ -763,27 +762,25 @@ def test_integer_col_names(self): if_exists='replace') def test_get_schema(self): - create_sql = sql.get_schema(self.test_frame1, 'test', 'sqlite', - con=self.conn) + create_sql = sql.get_schema(self.test_frame1, 'test', con=self.conn) self.assertTrue('CREATE' in create_sql) def test_get_schema_dtypes(self): float_frame = DataFrame({'a': [1.1, 1.2], 'b': [2.1, 2.2]}) dtype = sqlalchemy.Integer if self.mode == 'sqlalchemy' else 'INTEGER' - create_sql = sql.get_schema(float_frame, 'test', 'sqlite', + create_sql = sql.get_schema(float_frame, 'test', con=self.conn, dtype={'b': dtype}) self.assertTrue('CREATE' in create_sql) self.assertTrue('INTEGER' in create_sql) def test_get_schema_keys(self): frame = DataFrame({'Col1': [1.1, 1.2], 'Col2': [2.1, 2.2]}) - create_sql = sql.get_schema(frame, 'test', 'sqlite', - con=self.conn, keys='Col1') + create_sql = sql.get_schema(frame, 'test', con=self.conn, keys='Col1') constraint_sentence = 'CONSTRAINT test_pk PRIMARY KEY ("Col1")' self.assertTrue(constraint_sentence in create_sql) # multiple columns as key (GH10385) - create_sql = sql.get_schema(self.test_frame1, 'test', 'sqlite', + create_sql = sql.get_schema(self.test_frame1, 'test', con=self.conn, keys=['A', 'B']) constraint_sentence = 'CONSTRAINT test_pk PRIMARY KEY ("A", "B")' self.assertTrue(constraint_sentence in create_sql) @@ -1043,8 +1040,8 @@ def test_sql_open_close(self): with tm.ensure_clean() as name: conn = self.connect(name) - sql.to_sql(self.test_frame3, "test_frame3_legacy", conn, - flavor="sqlite", index=False) + sql.to_sql(self.test_frame3, "test_frame3_legacy", + conn, index=False) conn.close() conn = self.connect(name) @@ -1054,6 +1051,14 @@ def test_sql_open_close(self): tm.assert_frame_equal(self.test_frame3, result) + def test_con_string_import_error(self): + if not SQLALCHEMY_INSTALLED: + conn = 'mysql://root@localhost/pandas_nosetest' + self.assertRaises(ImportError, sql.read_sql, "SELECT * FROM iris", + conn) + else: + raise nose.SkipTest('SQLAlchemy is installed') + def test_read_sql_delegate(self): iris_frame1 = sql.read_sql_query("SELECT * FROM iris", self.conn) iris_frame2 = sql.read_sql("SELECT * FROM iris", self.conn) @@ -1066,25 +1071,13 @@ def test_safe_names_warning(self): df = DataFrame([[1, 2], [3, 4]], columns=['a', 'b ']) # has a space # warns on create table with spaces in names with tm.assert_produces_warning(): - sql.to_sql(df, "test_frame3_legacy", self.conn, - flavor="sqlite", index=False) + sql.to_sql(df, "test_frame3_legacy", self.conn, index=False) def test_get_schema2(self): # without providing a connection object (available for backwards comp) - create_sql = sql.get_schema(self.test_frame1, 'test', 'sqlite') + create_sql = sql.get_schema(self.test_frame1, 'test') self.assertTrue('CREATE' in create_sql) - def test_tquery(self): - with tm.assert_produces_warning(FutureWarning): - iris_results = 
sql.tquery("SELECT * FROM iris", con=self.conn) - row = iris_results[0] - tm.equalContents(row, [5.1, 3.5, 1.4, 0.2, 'Iris-setosa']) - - def test_uquery(self): - with tm.assert_produces_warning(FutureWarning): - rows = sql.uquery("SELECT * FROM iris LIMIT 1", con=self.conn) - self.assertEqual(rows, -1) - def _get_sqlite_column_type(self, schema, column): for col in schema.split('\n'): @@ -1097,7 +1090,7 @@ def test_sqlite_type_mapping(self): # Test Timestamp objects (no datetime64 because of timezone) (GH9085) df = DataFrame({'time': to_datetime(['201412120154', '201412110254'], utc=True)}) - db = sql.SQLiteDatabase(self.conn, self.flavor) + db = sql.SQLiteDatabase(self.conn) table = sql.SQLiteTable("test_type", db, frame=df) schema = table.sql_schema() self.assertEqual(self._get_sqlite_column_type(schema, 'time'), @@ -1275,7 +1268,7 @@ def test_datetime_with_timezone(self): def check(col): # check that a column is either datetime64[ns] # or datetime64[ns, UTC] - if com.is_datetime64_dtype(col.dtype): + if is_datetime64_dtype(col.dtype): # "2000-01-01 00:00:00-08:00" should convert to # "2000-01-01 08:00:00" @@ -1285,7 +1278,7 @@ def check(col): # "2000-06-01 07:00:00" self.assertEqual(col[1], Timestamp('2000-06-01 07:00:00')) - elif com.is_datetime64tz_dtype(col.dtype): + elif is_datetime64tz_dtype(col.dtype): self.assertTrue(str(col.dt.tz) == 'UTC') # "2000-01-01 00:00:00-08:00" should convert to @@ -1311,9 +1304,9 @@ def check(col): # even with the same versions of psycopg2 & sqlalchemy, possibly a # Postgrsql server version difference col = df.DateColWithTz - self.assertTrue(com.is_object_dtype(col.dtype) or - com.is_datetime64_dtype(col.dtype) or - com.is_datetime64tz_dtype(col.dtype), + self.assertTrue(is_object_dtype(col.dtype) or + is_datetime64_dtype(col.dtype) or + is_datetime64tz_dtype(col.dtype), "DateCol loaded with incorrect type -> {0}" .format(col.dtype)) @@ -1327,7 +1320,7 @@ def check(col): self.conn, chunksize=1)), ignore_index=True) col = df.DateColWithTz - self.assertTrue(com.is_datetime64tz_dtype(col.dtype), + self.assertTrue(is_datetime64tz_dtype(col.dtype), "DateCol loaded with incorrect type -> {0}" .format(col.dtype)) self.assertTrue(str(col.dt.tz) == 'UTC') @@ -1552,6 +1545,15 @@ def test_dtype(self): self.assertTrue(isinstance(sqltype, sqlalchemy.String)) self.assertEqual(sqltype.length, 10) + # single dtype + df.to_sql('single_dtype_test', self.conn, dtype=sqlalchemy.TEXT) + meta = sqlalchemy.schema.MetaData(bind=self.conn) + meta.reflect() + sqltypea = meta.tables['single_dtype_test'].columns['A'].type + sqltypeb = meta.tables['single_dtype_test'].columns['B'].type + self.assertTrue(isinstance(sqltypea, sqlalchemy.TEXT)) + self.assertTrue(isinstance(sqltypeb, sqlalchemy.TEXT)) + def test_notnull_dtype(self): cols = {'Bool': Series([True, None]), 'Date': Series([datetime(2012, 5, 1), None]), @@ -1907,16 +1909,12 @@ def connect(cls): def setUp(self): self.conn = self.connect() - self.pandasSQL = sql.SQLiteDatabase(self.conn, 'sqlite') + self.pandasSQL = sql.SQLiteDatabase(self.conn) self._load_iris_data() self._load_test1_data() - def test_invalid_flavor(self): - self.assertRaises( - NotImplementedError, sql.SQLiteDatabase, self.conn, 'oracle') - def test_read_sql(self): self._read_sql_iris() @@ -1964,7 +1962,7 @@ def test_execute_sql(self): def test_datetime_date(self): # test support for datetime.date df = DataFrame([date(2014, 1, 1), date(2014, 1, 2)], columns=["a"]) - df.to_sql('test_date', self.conn, index=False, flavor=self.flavor) + df.to_sql('test_date', 
self.conn, index=False) res = read_sql_query('SELECT * FROM test_date', self.conn) if self.flavor == 'sqlite': # comes back as strings @@ -1975,7 +1973,7 @@ def test_datetime_date(self): def test_datetime_time(self): # test support for datetime.time, GH #8341 df = DataFrame([time(9, 0, 0), time(9, 1, 30)], columns=["a"]) - df.to_sql('test_time', self.conn, index=False, flavor=self.flavor) + df.to_sql('test_time', self.conn, index=False) res = read_sql_query('SELECT * FROM test_time', self.conn) if self.flavor == 'sqlite': # comes back as strings @@ -2025,6 +2023,13 @@ def test_dtype(self): self.assertRaises(ValueError, df.to_sql, 'error', self.conn, dtype={'B': bool}) + # single dtype + df.to_sql('single_dtype_test', self.conn, dtype='STRING') + self.assertEqual( + self._get_sqlite_column_type('single_dtype_test', 'A'), 'STRING') + self.assertEqual( + self._get_sqlite_column_type('single_dtype_test', 'B'), 'STRING') + def test_notnull_dtype(self): if self.flavor == 'mysql': raise nose.SkipTest('Not applicable to MySQL legacy') @@ -2050,130 +2055,22 @@ def test_illegal_names(self): df = DataFrame([[1, 2], [3, 4]], columns=['a', 'b']) # Raise error on blank - self.assertRaises(ValueError, df.to_sql, "", self.conn, - flavor=self.flavor) + self.assertRaises(ValueError, df.to_sql, "", self.conn) for ndx, weird_name in enumerate( ['test_weird_name]', 'test_weird_name[', 'test_weird_name`', 'test_weird_name"', 'test_weird_name\'', '_b.test_weird_name_01-30', '"_b.test_weird_name_01-30"', '99beginswithnumber', '12345', u'\xe9']): - df.to_sql(weird_name, self.conn, flavor=self.flavor) + df.to_sql(weird_name, self.conn) sql.table_exists(weird_name, self.conn) df2 = DataFrame([[1, 2], [3, 4]], columns=['a', weird_name]) c_tbl = 'test_weird_col_name%d' % ndx - df2.to_sql(c_tbl, self.conn, flavor=self.flavor) + df2.to_sql(c_tbl, self.conn) sql.table_exists(c_tbl, self.conn) -class TestMySQLLegacy(MySQLMixIn, TestSQLiteFallback): - """ - Test the legacy mode against a MySQL database. 
- - """ - flavor = 'mysql' - - @classmethod - def setUpClass(cls): - cls.setup_driver() - - # test connection - try: - cls.connect() - except cls.driver.err.OperationalError: - raise nose.SkipTest( - "{0} - can't connect to MySQL server".format(cls)) - - @classmethod - def setup_driver(cls): - try: - import pymysql - cls.driver = pymysql - except ImportError: - raise nose.SkipTest('pymysql not installed') - - @classmethod - def connect(cls): - return cls.driver.connect(host='127.0.0.1', user='root', passwd='', - db='pandas_nosetest') - - def _count_rows(self, table_name): - cur = self._get_exec() - cur.execute( - "SELECT count(*) AS count_1 FROM %s" % table_name) - rows = cur.fetchall() - return rows[0][0] - - def setUp(self): - try: - self.conn = self.connect() - except self.driver.err.OperationalError: - raise nose.SkipTest("Can't connect to MySQL server") - - self.pandasSQL = sql.SQLiteDatabase(self.conn, 'mysql') - - self._load_iris_data() - self._load_test1_data() - - def test_a_deprecation(self): - with tm.assert_produces_warning(FutureWarning): - sql.to_sql(self.test_frame1, 'test_frame1', self.conn, - flavor='mysql') - self.assertTrue( - sql.has_table('test_frame1', self.conn, flavor='mysql'), - 'Table not written to DB') - - def _get_index_columns(self, tbl_name): - ixs = sql.read_sql_query( - "SHOW INDEX IN %s" % tbl_name, self.conn) - ix_cols = {} - for ix_name, ix_col in zip(ixs.Key_name, ixs.Column_name): - if ix_name not in ix_cols: - ix_cols[ix_name] = [] - ix_cols[ix_name].append(ix_col) - return list(ix_cols.values()) - - # TODO: cruft? - # def test_to_sql_save_index(self): - # self._to_sql_save_index() - - # for ix_name, ix_col in zip(ixs.Key_name, ixs.Column_name): - # if ix_name not in ix_cols: - # ix_cols[ix_name] = [] - # ix_cols[ix_name].append(ix_col) - # return ix_cols.values() - - def test_to_sql_save_index(self): - self._to_sql_save_index() - - def test_illegal_names(self): - df = DataFrame([[1, 2], [3, 4]], columns=['a', 'b']) - - # These tables and columns should be ok - for ndx, ok_name in enumerate(['99beginswithnumber', '12345']): - df.to_sql(ok_name, self.conn, flavor=self.flavor, index=False, - if_exists='replace') - df2 = DataFrame([[1, 2], [3, 4]], columns=['a', ok_name]) - - df2.to_sql('test_ok_col_name', self.conn, - flavor=self.flavor, index=False, - if_exists='replace') - - # For MySQL, these should raise ValueError - for ndx, illegal_name in enumerate( - ['test_illegal_name]', 'test_illegal_name[', - 'test_illegal_name`', 'test_illegal_name"', - 'test_illegal_name\'', '']): - self.assertRaises(ValueError, df.to_sql, illegal_name, self.conn, - flavor=self.flavor, index=False) - - df2 = DataFrame([[1, 2], [3, 4]], columns=['a', illegal_name]) - self.assertRaises(ValueError, df2.to_sql, - 'test_illegal_col_name%d' % ndx, - self.conn, flavor=self.flavor, index=False) - - # ----------------------------------------------------------------------------- # -- Old tests from 0.13.1 (before refactor using sqlalchemy) @@ -2207,6 +2104,15 @@ def format_query(sql, *args): return sql % tuple(processed_args) +def tquery(query, con=None, cur=None): + """Replace removed sql.tquery function""" + res = sql.execute(query, con=con, cur=cur).fetchall() + if res is None: + return None + else: + return list(res) + + def _skip_if_no_pymysql(): try: import pymysql # noqa @@ -2227,7 +2133,7 @@ def test_write_row_by_row(self): frame = tm.makeTimeDataFrame() frame.ix[0, 0] = np.nan - create_sql = sql.get_schema(frame, 'test', 'sqlite') + create_sql = sql.get_schema(frame, 
'test') cur = self.conn.cursor() cur.execute(create_sql) @@ -2236,7 +2142,7 @@ def test_write_row_by_row(self): ins = "INSERT INTO test VALUES (%s, %s, %s, %s)" for idx, row in frame.iterrows(): fmt_sql = format_query(ins, *row) - sql.tquery(fmt_sql, cur=cur) + tquery(fmt_sql, cur=cur) self.conn.commit() @@ -2246,7 +2152,7 @@ def test_write_row_by_row(self): def test_execute(self): frame = tm.makeTimeDataFrame() - create_sql = sql.get_schema(frame, 'test', 'sqlite') + create_sql = sql.get_schema(frame, 'test') cur = self.conn.cursor() cur.execute(create_sql) ins = "INSERT INTO test VALUES (?, ?, ?, ?)" @@ -2261,7 +2167,7 @@ def test_execute(self): def test_schema(self): frame = tm.makeTimeDataFrame() - create_sql = sql.get_schema(frame, 'test', 'sqlite') + create_sql = sql.get_schema(frame, 'test') lines = create_sql.splitlines() for l in lines: tokens = l.split(' ') @@ -2269,7 +2175,7 @@ def test_schema(self): self.assertTrue(tokens[1] == 'DATETIME') frame = tm.makeTimeDataFrame() - create_sql = sql.get_schema(frame, 'test', 'sqlite', keys=['A', 'B'],) + create_sql = sql.get_schema(frame, 'test', keys=['A', 'B']) lines = create_sql.splitlines() self.assertTrue('PRIMARY KEY ("A", "B")' in create_sql) cur = self.conn.cursor() @@ -2316,7 +2222,7 @@ def test_execute_closed_connection(self): self.conn.close() try: sys.stdout = StringIO() - self.assertRaises(Exception, sql.tquery, "select * from test", + self.assertRaises(Exception, tquery, "select * from test", con=self.conn) finally: sys.stdout = sys.__stdout__ @@ -2348,42 +2254,6 @@ def _check_roundtrip(self, frame): expected.index.name = 'Idx' tm.assert_frame_equal(expected, result) - def test_tquery(self): - frame = tm.makeTimeDataFrame() - sql.to_sql(frame, name='test_table', con=self.conn, index=False) - result = sql.tquery("select A from test_table", self.conn) - expected = Series(frame.A.values, frame.index) # not to have name - result = Series(result, frame.index) - tm.assert_series_equal(result, expected) - - try: - sys.stdout = StringIO() - self.assertRaises(sql.DatabaseError, sql.tquery, - 'select * from blah', con=self.conn) - - self.assertRaises(sql.DatabaseError, sql.tquery, - 'select * from blah', con=self.conn, retry=True) - finally: - sys.stdout = sys.__stdout__ - - def test_uquery(self): - frame = tm.makeTimeDataFrame() - sql.to_sql(frame, name='test_table', con=self.conn, index=False) - stmt = 'INSERT INTO test_table VALUES(2.314, -123.1, 1.234, 2.3)' - self.assertEqual(sql.uquery(stmt, con=self.conn), 1) - - try: - sys.stdout = StringIO() - - self.assertRaises(sql.DatabaseError, sql.tquery, - 'insert into blah values (1)', con=self.conn) - - self.assertRaises(sql.DatabaseError, sql.tquery, - 'insert into blah values (1)', con=self.conn, - retry=True) - finally: - sys.stdout = sys.__stdout__ - def test_keyword_as_column_names(self): df = DataFrame({'From': np.ones(5)}) sql.to_sql(df, con=self.conn, name='testkeywords', index=False) @@ -2424,44 +2294,68 @@ def clean_up(test_table_to_drop): frame=df_if_exists_1, con=self.conn, name=table_name, - flavor='sqlite', if_exists='notvalidvalue') clean_up(table_name) # test if_exists='fail' - sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name, - flavor='sqlite', if_exists='fail') + sql.to_sql(frame=df_if_exists_1, con=self.conn, + name=table_name, if_exists='fail') self.assertRaises(ValueError, sql.to_sql, frame=df_if_exists_1, con=self.conn, name=table_name, - flavor='sqlite', if_exists='fail') # test if_exists='replace' sql.to_sql(frame=df_if_exists_1, con=self.conn, 
name=table_name, - flavor='sqlite', if_exists='replace', index=False) - self.assertEqual(sql.tquery(sql_select, con=self.conn), + if_exists='replace', index=False) + self.assertEqual(tquery(sql_select, con=self.conn), [(1, 'A'), (2, 'B')]) sql.to_sql(frame=df_if_exists_2, con=self.conn, name=table_name, - flavor='sqlite', if_exists='replace', index=False) - self.assertEqual(sql.tquery(sql_select, con=self.conn), + if_exists='replace', index=False) + self.assertEqual(tquery(sql_select, con=self.conn), [(3, 'C'), (4, 'D'), (5, 'E')]) clean_up(table_name) # test if_exists='append' sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name, - flavor='sqlite', if_exists='fail', index=False) - self.assertEqual(sql.tquery(sql_select, con=self.conn), + if_exists='fail', index=False) + self.assertEqual(tquery(sql_select, con=self.conn), [(1, 'A'), (2, 'B')]) sql.to_sql(frame=df_if_exists_2, con=self.conn, name=table_name, - flavor='sqlite', if_exists='append', index=False) - self.assertEqual(sql.tquery(sql_select, con=self.conn), + if_exists='append', index=False) + self.assertEqual(tquery(sql_select, con=self.conn), [(1, 'A'), (2, 'B'), (3, 'C'), (4, 'D'), (5, 'E')]) clean_up(table_name) +class TestSQLFlavorDeprecation(tm.TestCase): + """ + gh-13611: test that the 'flavor' parameter + is appropriately deprecated by checking the + functions that directly raise the warning + """ + + con = 1234 # don't need real connection for this + funcs = ['SQLiteDatabase', 'pandasSQL_builder'] + + def test_unsupported_flavor(self): + msg = 'is not supported' + + for func in self.funcs: + tm.assertRaisesRegexp(ValueError, msg, getattr(sql, func), + self.con, flavor='mysql') + + def test_deprecated_flavor(self): + for func in self.funcs: + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + getattr(sql, func)(self.con, flavor='sqlite') + + +@unittest.skip("gh-13611: there is no support for MySQL " + "if SQLAlchemy is not installed") class TestXMySQL(MySQLMixIn, tm.TestCase): @classmethod @@ -2530,14 +2424,14 @@ def test_write_row_by_row(self): frame = tm.makeTimeDataFrame() frame.ix[0, 0] = np.nan drop_sql = "DROP TABLE IF EXISTS test" - create_sql = sql.get_schema(frame, 'test', 'mysql') + create_sql = sql.get_schema(frame, 'test') cur = self.conn.cursor() cur.execute(drop_sql) cur.execute(create_sql) ins = "INSERT INTO test VALUES (%s, %s, %s, %s)" for idx, row in frame.iterrows(): fmt_sql = format_query(ins, *row) - sql.tquery(fmt_sql, cur=cur) + tquery(fmt_sql, cur=cur) self.conn.commit() @@ -2552,7 +2446,7 @@ def test_chunksize_read_type(self): drop_sql = "DROP TABLE IF EXISTS test" cur = self.conn.cursor() cur.execute(drop_sql) - sql.to_sql(frame, name='test', con=self.conn, flavor='mysql') + sql.to_sql(frame, name='test', con=self.conn) query = "select * from test" chunksize = 5 chunk_gen = pd.read_sql_query(sql=query, con=self.conn, @@ -2564,7 +2458,7 @@ def test_execute(self): _skip_if_no_pymysql() frame = tm.makeTimeDataFrame() drop_sql = "DROP TABLE IF EXISTS test" - create_sql = sql.get_schema(frame, 'test', 'mysql') + create_sql = sql.get_schema(frame, 'test') cur = self.conn.cursor() with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unknown table.*") @@ -2583,7 +2477,7 @@ def test_execute(self): def test_schema(self): _skip_if_no_pymysql() frame = tm.makeTimeDataFrame() - create_sql = sql.get_schema(frame, 'test', 'mysql') + create_sql = sql.get_schema(frame, 'test') lines = create_sql.splitlines() for l in lines: tokens = l.split(' ') @@ -2592,7 +2486,7 
@@ def test_schema(self): frame = tm.makeTimeDataFrame() drop_sql = "DROP TABLE IF EXISTS test" - create_sql = sql.get_schema(frame, 'test', 'mysql', keys=['A', 'B'],) + create_sql = sql.get_schema(frame, 'test', keys=['A', 'B']) lines = create_sql.splitlines() self.assertTrue('PRIMARY KEY (`A`, `B`)' in create_sql) cur = self.conn.cursor() @@ -2646,7 +2540,7 @@ def test_execute_closed_connection(self): self.conn.close() try: sys.stdout = StringIO() - self.assertRaises(Exception, sql.tquery, "select * from test", + self.assertRaises(Exception, tquery, "select * from test", con=self.conn) finally: sys.stdout = sys.__stdout__ @@ -2665,8 +2559,7 @@ def _check_roundtrip(self, frame): with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unknown table.*") cur.execute(drop_sql) - sql.to_sql(frame, name='test_table', - con=self.conn, flavor='mysql', index=False) + sql.to_sql(frame, name='test_table', con=self.conn, index=False) result = sql.read_sql("select * from test_table", self.conn) # HACK! Change this once indexes are handled properly. @@ -2686,7 +2579,7 @@ def _check_roundtrip(self, frame): warnings.filterwarnings("ignore", "Unknown table.*") cur.execute(drop_sql) sql.to_sql(frame2, name='test_table2', - con=self.conn, flavor='mysql', index=False) + con=self.conn, index=False) result = sql.read_sql("select * from test_table2", self.conn, index_col='Idx') expected = frame.copy() @@ -2696,63 +2589,11 @@ def _check_roundtrip(self, frame): expected.index.names = result.index.names tm.assert_frame_equal(expected, result) - def test_tquery(self): - try: - import pymysql # noqa - except ImportError: - raise nose.SkipTest("no pymysql") - frame = tm.makeTimeDataFrame() - drop_sql = "DROP TABLE IF EXISTS test_table" - cur = self.conn.cursor() - cur.execute(drop_sql) - sql.to_sql(frame, name='test_table', - con=self.conn, flavor='mysql', index=False) - result = sql.tquery("select A from test_table", self.conn) - expected = Series(frame.A.values, frame.index) # not to have name - result = Series(result, frame.index) - tm.assert_series_equal(result, expected) - - try: - sys.stdout = StringIO() - self.assertRaises(sql.DatabaseError, sql.tquery, - 'select * from blah', con=self.conn) - - self.assertRaises(sql.DatabaseError, sql.tquery, - 'select * from blah', con=self.conn, retry=True) - finally: - sys.stdout = sys.__stdout__ - - def test_uquery(self): - try: - import pymysql # noqa - except ImportError: - raise nose.SkipTest("no pymysql") - frame = tm.makeTimeDataFrame() - drop_sql = "DROP TABLE IF EXISTS test_table" - cur = self.conn.cursor() - cur.execute(drop_sql) - sql.to_sql(frame, name='test_table', - con=self.conn, flavor='mysql', index=False) - stmt = 'INSERT INTO test_table VALUES(2.314, -123.1, 1.234, 2.3)' - self.assertEqual(sql.uquery(stmt, con=self.conn), 1) - - try: - sys.stdout = StringIO() - - self.assertRaises(sql.DatabaseError, sql.tquery, - 'insert into blah values (1)', con=self.conn) - - self.assertRaises(sql.DatabaseError, sql.tquery, - 'insert into blah values (1)', con=self.conn, - retry=True) - finally: - sys.stdout = sys.__stdout__ - def test_keyword_as_column_names(self): _skip_if_no_pymysql() df = DataFrame({'From': np.ones(5)}) sql.to_sql(df, con=self.conn, name='testkeywords', - if_exists='replace', flavor='mysql', index=False) + if_exists='replace', index=False) def test_if_exists(self): _skip_if_no_pymysql() @@ -2775,40 +2616,38 @@ def clean_up(test_table_to_drop): frame=df_if_exists_1, con=self.conn, name=table_name, - flavor='mysql', 
if_exists='notvalidvalue') clean_up(table_name) # test if_exists='fail' sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name, - flavor='mysql', if_exists='fail', index=False) + if_exists='fail', index=False) self.assertRaises(ValueError, sql.to_sql, frame=df_if_exists_1, con=self.conn, name=table_name, - flavor='mysql', if_exists='fail') # test if_exists='replace' sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name, - flavor='mysql', if_exists='replace', index=False) - self.assertEqual(sql.tquery(sql_select, con=self.conn), + if_exists='replace', index=False) + self.assertEqual(tquery(sql_select, con=self.conn), [(1, 'A'), (2, 'B')]) sql.to_sql(frame=df_if_exists_2, con=self.conn, name=table_name, - flavor='mysql', if_exists='replace', index=False) - self.assertEqual(sql.tquery(sql_select, con=self.conn), + if_exists='replace', index=False) + self.assertEqual(tquery(sql_select, con=self.conn), [(3, 'C'), (4, 'D'), (5, 'E')]) clean_up(table_name) # test if_exists='append' sql.to_sql(frame=df_if_exists_1, con=self.conn, name=table_name, - flavor='mysql', if_exists='fail', index=False) - self.assertEqual(sql.tquery(sql_select, con=self.conn), + if_exists='fail', index=False) + self.assertEqual(tquery(sql_select, con=self.conn), [(1, 'A'), (2, 'B')]) sql.to_sql(frame=df_if_exists_2, con=self.conn, name=table_name, - flavor='mysql', if_exists='append', index=False) - self.assertEqual(sql.tquery(sql_select, con=self.conn), + if_exists='append', index=False) + self.assertEqual(tquery(sql_select, con=self.conn), [(1, 'A'), (2, 'B'), (3, 'C'), (4, 'D'), (5, 'E')]) clean_up(table_name) diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 830c68d62efad..7752fff5247c0 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -1,27 +1,27 @@ # -*- coding: utf-8 -*- # pylint: disable=E1101 -from datetime import datetime import datetime as dt import os -import warnings -import nose import struct import sys +import warnings +from datetime import datetime from distutils.version import LooseVersion +import nose import numpy as np +from pandas.tslib import NaT import pandas as pd +import pandas.util.testing as tm +from pandas import compat from pandas.compat import iterkeys from pandas.core.frame import DataFrame, Series -from pandas.core.common import is_categorical_dtype from pandas.io.parsers import read_csv from pandas.io.stata import (read_stata, StataReader, InvalidColumnName, PossiblePrecisionLoss, StataMissingValue) -import pandas.util.testing as tm -from pandas.tslib import NaT -from pandas import compat +from pandas.types.common import is_categorical_dtype class TestStata(tm.TestCase): @@ -80,6 +80,7 @@ def setUp(self): self.dta21_117 = os.path.join(self.dirpath, 'stata12_117.dta') self.dta22_118 = os.path.join(self.dirpath, 'stata14_118.dta') + self.dta23 = os.path.join(self.dirpath, 'stata15.dta') def read_dta(self, file): # Legacy default reader configuration @@ -1113,6 +1114,110 @@ def test_read_chunks_columns(self): tm.assert_frame_equal(from_frame, chunk, check_dtype=False) pos += chunksize + def test_write_variable_labels(self): + # GH 13631, add support for writing variable labels + original = pd.DataFrame({'a': [1, 2, 3, 4], + 'b': [1.0, 3.0, 27.0, 81.0], + 'c': ['Atlanta', 'Birmingham', + 'Cincinnati', 'Detroit']}) + original.index.name = 'index' + variable_labels = {'a': 'City Rank', 'b': 'City Exponent', 'c': 'City'} + with tm.ensure_clean() as path: + original.to_stata(path, variable_labels=variable_labels) + 
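
A quick illustration of the variable_labels support in DataFrame.to_stata (GH 13631) that the surrounding test_stata.py additions exercise. This is a hedged sketch, not the test itself; the file name is illustrative, and labels over 80 characters or containing non-latin-1 text raise ValueError, as the error tests below check:

    import pandas as pd

    df = pd.DataFrame({'rank': [1, 2, 3],
                       'city': ['Atlanta', 'Birmingham', 'Cincinnati']})
    labels = {'rank': 'City Rank', 'city': 'City Name'}

    # 'cities.dta' is an illustrative output path.
    df.to_stata('cities.dta', write_index=False, variable_labels=labels)
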
with StataReader(path) as sr: + read_labels = sr.variable_labels() + expected_labels = {'index': '', + 'a': 'City Rank', + 'b': 'City Exponent', + 'c': 'City'} + tm.assert_equal(read_labels, expected_labels) + + variable_labels['index'] = 'The Index' + with tm.ensure_clean() as path: + original.to_stata(path, variable_labels=variable_labels) + with StataReader(path) as sr: + read_labels = sr.variable_labels() + tm.assert_equal(read_labels, variable_labels) + + def test_write_variable_label_errors(self): + original = pd.DataFrame({'a': [1, 2, 3, 4], + 'b': [1.0, 3.0, 27.0, 81.0], + 'c': ['Atlanta', 'Birmingham', + 'Cincinnati', 'Detroit']}) + values = [u'\u03A1', u'\u0391', + u'\u039D', u'\u0394', + u'\u0391', u'\u03A3'] + + variable_labels_utf8 = {'a': 'City Rank', + 'b': 'City Exponent', + 'c': u''.join(values)} + + with tm.assertRaises(ValueError): + with tm.ensure_clean() as path: + original.to_stata(path, variable_labels=variable_labels_utf8) + + variable_labels_long = {'a': 'City Rank', + 'b': 'City Exponent', + 'c': 'A very, very, very long variable label ' + 'that is too long for Stata which means ' + 'that it has more than 80 characters'} + + with tm.assertRaises(ValueError): + with tm.ensure_clean() as path: + original.to_stata(path, variable_labels=variable_labels_long) + + def test_default_date_conversion(self): + # GH 12259 + dates = [dt.datetime(1999, 12, 31, 12, 12, 12, 12000), + dt.datetime(2012, 12, 21, 12, 21, 12, 21000), + dt.datetime(1776, 7, 4, 7, 4, 7, 4000)] + original = pd.DataFrame({'nums': [1.0, 2.0, 3.0], + 'strs': ['apple', 'banana', 'cherry'], + 'dates': dates}) + + with tm.ensure_clean() as path: + original.to_stata(path, write_index=False) + reread = read_stata(path, convert_dates=True) + tm.assert_frame_equal(original, reread) + + original.to_stata(path, + write_index=False, + convert_dates={'dates': 'tc'}) + direct = read_stata(path, convert_dates=True) + tm.assert_frame_equal(reread, direct) + + def test_unsupported_type(self): + original = pd.DataFrame({'a': [1 + 2j, 2 + 4j]}) + + with tm.assertRaises(NotImplementedError): + with tm.ensure_clean() as path: + original.to_stata(path) + + def test_unsupported_datetype(self): + dates = [dt.datetime(1999, 12, 31, 12, 12, 12, 12000), + dt.datetime(2012, 12, 21, 12, 21, 12, 21000), + dt.datetime(1776, 7, 4, 7, 4, 7, 4000)] + original = pd.DataFrame({'nums': [1.0, 2.0, 3.0], + 'strs': ['apple', 'banana', 'cherry'], + 'dates': dates}) + + with tm.assertRaises(NotImplementedError): + with tm.ensure_clean() as path: + original.to_stata(path, convert_dates={'dates': 'tC'}) + + dates = pd.date_range('1-1-1990', periods=3, tz='Asia/Hong_Kong') + original = pd.DataFrame({'nums': [1.0, 2.0, 3.0], + 'strs': ['apple', 'banana', 'cherry'], + 'dates': dates}) + with tm.assertRaises(NotImplementedError): + with tm.ensure_clean() as path: + original.to_stata(path) + + def test_repeated_column_labels(self): + # GH 13923 + with tm.assertRaises(ValueError) as cm: + read_stata(self.dta23, convert_categoricals=True) + tm.assertTrue('wolof' in cm.exception) if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/io/tests/test_wb.py b/pandas/io/tests/test_wb.py deleted file mode 100644 index 42884b19de03a..0000000000000 --- a/pandas/io/tests/test_wb.py +++ /dev/null @@ -1,114 +0,0 @@ -# flake8: noqa - -import nose - -import pandas -from pandas.compat import u -from pandas.util.testing import network -from pandas.util.testing import assert_frame_equal -import 
pandas.util.testing as tm - -# deprecated -with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - from pandas.io.wb import search, download, get_countries - -class TestWB(tm.TestCase): - - @tm.slow - @network - def test_wdi_search(self): - - # Test that a name column exists, and that some results were returned - # ...without being too strict about what the actual contents of the - # results actually are. The fact that there are some, is good enough. - - result = search('gdp.*capita.*constant') - self.assertTrue(result.name.str.contains('GDP').any()) - - @tm.slow - @network - def test_wdi_download(self): - - # Test a bad indicator with double (US), triple (USA), - # standard (CA, MX), non standard (KSV), - # duplicated (US, US, USA), and unknown (BLA) country codes - - # ...but NOT a crash inducing country code (World bank strips pandas - # users of the luxury of laziness, because they create their - # own exceptions, and don't clean up legacy country codes. - # ...but NOT a retired indicator (User should want it to error.) - - cntry_codes = ['CA', 'MX', 'USA', 'US', 'US', 'KSV', 'BLA'] - inds = ['NY.GDP.PCAP.CD','BAD.INDICATOR'] - - expected = {'NY.GDP.PCAP.CD': {('Canada', '2003'): 28026.006013044702, ('Mexico', '2003'): 6601.0420648056606, ('Canada', '2004'): 31829.522562759001, ('Kosovo', '2003'): 1969.56271307405, ('Mexico', '2004'): 7042.0247834044303, ('United States', '2004'): 41928.886136479705, ('United States', '2003'): 39682.472247320402, ('Kosovo', '2004'): 2135.3328465238301}} - expected = pandas.DataFrame(expected) - #Round, to ignore revisions to data. - expected = pandas.np.round(expected,decimals=-3) - expected.sort(inplace=True) - result = download(country=cntry_codes, indicator=inds, - start=2003, end=2004, errors='ignore') - result.sort(inplace=True) - #Round, to ignore revisions to data. - result = pandas.np.round(result,decimals=-3) - expected.index = result.index - assert_frame_equal(result, pandas.DataFrame(expected)) - - @tm.slow - @network - def test_wdi_download_w_retired_indicator(self): - - cntry_codes = ['CA', 'MX', 'US'] - # Despite showing up in the search feature, and being listed online, - # the api calls to GDPPCKD don't work in their own query builder, nor - # pandas module. GDPPCKD used to be a common symbol. - # This test is written to ensure that error messages to pandas users - # continue to make sense, rather than a user getting some missing - # key error, cause their JSON message format changed. If - # World bank ever finishes the deprecation of this symbol, - # this nose test should still pass. - - inds = ['GDPPCKD'] - - try: - result = download(country=cntry_codes, indicator=inds, - start=2003, end=2004, errors='ignore') - # If for some reason result actually ever has data, it's cause WB - # fixed the issue with this ticker. Find another bad one. - except ValueError as e: - raise nose.SkipTest("No indicators returned data: {0}".format(e)) - - # if it ever gets here, it means WB unretired the indicator. - # even if they dropped it completely, it would still get caught above - # or the WB API changed somehow in a really unexpected way. 
- if len(result) > 0: - raise nose.SkipTest("Invalid results") - - @tm.slow - @network - def test_wdi_download_w_crash_inducing_countrycode(self): - - cntry_codes = ['CA', 'MX', 'US', 'XXX'] - inds = ['NY.GDP.PCAP.CD'] - - try: - result = download(country=cntry_codes, indicator=inds, - start=2003, end=2004, errors='ignore') - except ValueError as e: - raise nose.SkipTest("No indicators returned data: {0}".format(e)) - - # if it ever gets here, it means the country code XXX got used by WB - # or the WB API changed somehow in a really unexpected way. - if len(result) > 0: - raise nose.SkipTest("Invalid results") - - @tm.slow - @network - def test_wdi_get_countries(self): - result = get_countries() - self.assertTrue('Zimbabwe' in list(result['name'])) - self.assertTrue(len(result) > 100) - -if __name__ == '__main__': - nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], - exit=False) diff --git a/pandas/io/wb.py b/pandas/io/wb.py index 81b4947f06b16..5dc4d9ce1adc4 100644 --- a/pandas/io/wb.py +++ b/pandas/io/wb.py @@ -1,314 +1,6 @@ -# -*- coding: utf-8 -*- - -# flake8: noqa - -from __future__ import print_function - -from pandas.compat import map, reduce, range, lrange -from pandas.io.common import urlopen -from pandas.io import json -import pandas -import numpy as np -import warnings - -warnings.warn("\n" - "The pandas.io.wb module is moved to a separate package " - "(pandas-datareader) and will be removed from pandas in a " - "future version.\nAfter installing the pandas-datareader package " - "(https://github.com/pydata/pandas-datareader), you can change " - "the import ``from pandas.io import data, wb`` to " - "``from pandas_datareader import data, wb``.", - FutureWarning) - - -# This list of country codes was pulled from wikipedia during October 2014. -# While some exceptions do exist, it is the best proxy for countries supported -# by World Bank. It is an aggregation of the 2-digit ISO 3166-1 alpha-2, and -# 3-digit ISO 3166-1 alpha-3, codes, with 'all', 'ALL', and 'All' appended ot -# the end. 
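
Since pandas.io.wb is reduced to an ImportError stub later in this patch, the replacement lives in the separate pandas-datareader package. A hedged sketch of the equivalent call after installing that package; the indicator and country codes are just examples mirroring the removed defaults:

    # pip install pandas-datareader
    from pandas_datareader import wb

    gdp = wb.download(indicator='NY.GDP.PCAP.CD',
                      country=['US', 'CA', 'MX'],
                      start=2003, end=2005)
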
- -country_codes = ['AD', 'AE', 'AF', 'AG', 'AI', 'AL', 'AM', 'AO', 'AQ', 'AR', \ - 'AS', 'AT', 'AU', 'AW', 'AX', 'AZ', 'BA', 'BB', 'BD', 'BE', \ - 'BF', 'BG', 'BH', 'BI', 'BJ', 'BL', 'BM', 'BN', 'BO', 'BQ', \ - 'BR', 'BS', 'BT', 'BV', 'BW', 'BY', 'BZ', 'CA', 'CC', 'CD', \ - 'CF', 'CG', 'CH', 'CI', 'CK', 'CL', 'CM', 'CN', 'CO', 'CR', \ - 'CU', 'CV', 'CW', 'CX', 'CY', 'CZ', 'DE', 'DJ', 'DK', 'DM', \ - 'DO', 'DZ', 'EC', 'EE', 'EG', 'EH', 'ER', 'ES', 'ET', 'FI', \ - 'FJ', 'FK', 'FM', 'FO', 'FR', 'GA', 'GB', 'GD', 'GE', 'GF', \ - 'GG', 'GH', 'GI', 'GL', 'GM', 'GN', 'GP', 'GQ', 'GR', 'GS', \ - 'GT', 'GU', 'GW', 'GY', 'HK', 'HM', 'HN', 'HR', 'HT', 'HU', \ - 'ID', 'IE', 'IL', 'IM', 'IN', 'IO', 'IQ', 'IR', 'IS', 'IT', \ - 'JE', 'JM', 'JO', 'JP', 'KE', 'KG', 'KH', 'KI', 'KM', 'KN', \ - 'KP', 'KR', 'KW', 'KY', 'KZ', 'LA', 'LB', 'LC', 'LI', 'LK', \ - 'LR', 'LS', 'LT', 'LU', 'LV', 'LY', 'MA', 'MC', 'MD', 'ME', \ - 'MF', 'MG', 'MH', 'MK', 'ML', 'MM', 'MN', 'MO', 'MP', 'MQ', \ - 'MR', 'MS', 'MT', 'MU', 'MV', 'MW', 'MX', 'MY', 'MZ', 'NA', \ - 'NC', 'NE', 'NF', 'NG', 'NI', 'NL', 'NO', 'NP', 'NR', 'NU', \ - 'NZ', 'OM', 'PA', 'PE', 'PF', 'PG', 'PH', 'PK', 'PL', 'PM', \ - 'PN', 'PR', 'PS', 'PT', 'PW', 'PY', 'QA', 'RE', 'RO', 'RS', \ - 'RU', 'RW', 'SA', 'SB', 'SC', 'SD', 'SE', 'SG', 'SH', 'SI', \ - 'SJ', 'SK', 'SL', 'SM', 'SN', 'SO', 'SR', 'SS', 'ST', 'SV', \ - 'SX', 'SY', 'SZ', 'TC', 'TD', 'TF', 'TG', 'TH', 'TJ', 'TK', \ - 'TL', 'TM', 'TN', 'TO', 'TR', 'TT', 'TV', 'TW', 'TZ', 'UA', \ - 'UG', 'UM', 'US', 'UY', 'UZ', 'VA', 'VC', 'VE', 'VG', 'VI', \ - 'VN', 'VU', 'WF', 'WS', 'YE', 'YT', 'ZA', 'ZM', 'ZW', \ - 'ABW', 'AFG', 'AGO', 'AIA', 'ALA', 'ALB', 'AND', 'ARE', \ - 'ARG', 'ARM', 'ASM', 'ATA', 'ATF', 'ATG', 'AUS', 'AUT', \ - 'AZE', 'BDI', 'BEL', 'BEN', 'BES', 'BFA', 'BGD', 'BGR', \ - 'BHR', 'BHS', 'BIH', 'BLM', 'BLR', 'BLZ', 'BMU', 'BOL', \ - 'BRA', 'BRB', 'BRN', 'BTN', 'BVT', 'BWA', 'CAF', 'CAN', \ - 'CCK', 'CHE', 'CHL', 'CHN', 'CIV', 'CMR', 'COD', 'COG', \ - 'COK', 'COL', 'COM', 'CPV', 'CRI', 'CUB', 'CUW', 'CXR', \ - 'CYM', 'CYP', 'CZE', 'DEU', 'DJI', 'DMA', 'DNK', 'DOM', \ - 'DZA', 'ECU', 'EGY', 'ERI', 'ESH', 'ESP', 'EST', 'ETH', \ - 'FIN', 'FJI', 'FLK', 'FRA', 'FRO', 'FSM', 'GAB', 'GBR', \ - 'GEO', 'GGY', 'GHA', 'GIB', 'GIN', 'GLP', 'GMB', 'GNB', \ - 'GNQ', 'GRC', 'GRD', 'GRL', 'GTM', 'GUF', 'GUM', 'GUY', \ - 'HKG', 'HMD', 'HND', 'HRV', 'HTI', 'HUN', 'IDN', 'IMN', \ - 'IND', 'IOT', 'IRL', 'IRN', 'IRQ', 'ISL', 'ISR', 'ITA', \ - 'JAM', 'JEY', 'JOR', 'JPN', 'KAZ', 'KEN', 'KGZ', 'KHM', \ - 'KIR', 'KNA', 'KOR', 'KWT', 'LAO', 'LBN', 'LBR', 'LBY', \ - 'LCA', 'LIE', 'LKA', 'LSO', 'LTU', 'LUX', 'LVA', 'MAC', \ - 'MAF', 'MAR', 'MCO', 'MDA', 'MDG', 'MDV', 'MEX', 'MHL', \ - 'MKD', 'MLI', 'MLT', 'MMR', 'MNE', 'MNG', 'MNP', 'MOZ', \ - 'MRT', 'MSR', 'MTQ', 'MUS', 'MWI', 'MYS', 'MYT', 'NAM', \ - 'NCL', 'NER', 'NFK', 'NGA', 'NIC', 'NIU', 'NLD', 'NOR', \ - 'NPL', 'NRU', 'NZL', 'OMN', 'PAK', 'PAN', 'PCN', 'PER', \ - 'PHL', 'PLW', 'PNG', 'POL', 'PRI', 'PRK', 'PRT', 'PRY', \ - 'PSE', 'PYF', 'QAT', 'REU', 'ROU', 'RUS', 'RWA', 'SAU', \ - 'SDN', 'SEN', 'SGP', 'SGS', 'SHN', 'SJM', 'SLB', 'SLE', \ - 'SLV', 'SMR', 'SOM', 'SPM', 'SRB', 'SSD', 'STP', 'SUR', \ - 'SVK', 'SVN', 'SWE', 'SWZ', 'SXM', 'SYC', 'SYR', 'TCA', \ - 'TCD', 'TGO', 'THA', 'TJK', 'TKL', 'TKM', 'TLS', 'TON', \ - 'TTO', 'TUN', 'TUR', 'TUV', 'TWN', 'TZA', 'UGA', 'UKR', \ - 'UMI', 'URY', 'USA', 'UZB', 'VAT', 'VCT', 'VEN', 'VGB', \ - 'VIR', 'VNM', 'VUT', 'WLF', 'WSM', 'YEM', 'ZAF', 'ZMB', \ - 'ZWE', 'all', 'ALL', 'All'] - -def download(country=None, indicator=None, - 
start=2003, end=2005,errors='warn'): - """ - Download data series from the World Bank's World Development Indicators - - Parameters - ---------- - - indicator: string or list of strings - taken from the ``id`` field in ``WDIsearch()`` - - country: string or list of strings. - ``all`` downloads data for all countries - 2 or 3 character ISO country codes select individual - countries (e.g.``US``,``CA``) or (e.g.``USA``,``CAN``). The codes - can be mixed. - - The two ISO lists of countries, provided by wikipedia, are hardcoded - into pandas as of 11/10/2014. - - start: int - First year of the data series - - end: int - Last year of the data series (inclusive) - - errors: str {'ignore', 'warn', 'raise'}, default 'warn' - Country codes are validated against a hardcoded list. This controls - the outcome of that validation, and attempts to also apply - to the results from world bank. - - errors='raise', will raise a ValueError on a bad country code. - - Returns - ------- - - ``pandas`` DataFrame with columns: country, iso_code, year, - indicator value. - - """ - if country is None: - country = ['MX', 'CA', 'US'] - if indicator is None: - indicator = ['NY.GDP.MKTP.CD', 'NY.GNS.ICTR.ZS'] - - if type(country) == str: - country = [country] - - bad_countries = np.setdiff1d(country, country_codes) - - # Validate the input - if len(bad_countries) > 0: - tmp = ", ".join(bad_countries) - if errors == 'raise': - raise ValueError("Invalid Country Code(s): %s" % tmp) - if errors == 'warn': - warnings.warn('Non-standard ISO country codes: %s' % tmp) - - # Work with a list of indicators - if type(indicator) == str: - indicator = [indicator] - - # Download - data = [] - bad_indicators = {} - for ind in indicator: - one_indicator_data,msg = _get_data(ind, country, start, end) - if msg == "Success": - data.append(one_indicator_data) - else: - bad_indicators[ind] = msg - - if len(bad_indicators.keys()) > 0: - bad_ind_msgs = [i + " : " + m for i,m in bad_indicators.items()] - bad_ind_msgs = "\n\n".join(bad_ind_msgs) - bad_ind_msgs = "\n\nInvalid Indicators:\n\n%s" % bad_ind_msgs - if errors == 'raise': - raise ValueError(bad_ind_msgs) - if errors == 'warn': - warnings.warn(bad_ind_msgs) - - # Confirm we actually got some data, and build Dataframe - if len(data) > 0: - out = reduce(lambda x, y: x.merge(y, how='outer'), data) - out = out.drop('iso_code', axis=1) - out = out.set_index(['country', 'year']) - out = out._convert(datetime=True, numeric=True) - return out - else: - msg = "No indicators returned data." - if errors == 'ignore': - msg += " Set errors='warn' for more information." 
- raise ValueError(msg) - -def _get_data(indicator="NY.GNS.ICTR.GN.ZS", country='US', - start=2002, end=2005): - - if type(country) == str: - country = [country] - - countries = ';'.join(country) - - # Build URL for api call - url = ("http://api.worldbank.org/countries/" + countries + "/indicators/" + - indicator + "?date=" + str(start) + ":" + str(end) + - "&per_page=25000&format=json") - - # Download - with urlopen(url) as response: - data = response.read() - - # Check to see if there is a possible problem - possible_message = json.loads(data)[0] - if 'message' in possible_message.keys(): - msg = possible_message['message'][0] - try: - msg = msg['key'].split() + ["\n "] + msg['value'].split() - wb_err = ' '.join(msg) - except: - wb_err = "" - if 'key' in msg.keys(): - wb_err = msg['key'] + "\n " - if 'value' in msg.keys(): - wb_err += msg['value'] - error_msg = "Problem with a World Bank Query \n %s" - return None, error_msg % wb_err - - if 'total' in possible_message.keys(): - if possible_message['total'] == 0: - return None, "No results from world bank." - - # Parse JSON file - data = json.loads(data)[1] - country = [x['country']['value'] for x in data] - iso_code = [x['country']['id'] for x in data] - year = [x['date'] for x in data] - value = [x['value'] for x in data] - # Prepare output - out = pandas.DataFrame([country, iso_code, year, value]).T - out.columns = ['country', 'iso_code', 'year', indicator] - return out,"Success" - -def get_countries(): - """Query information about countries - """ - url = 'http://api.worldbank.org/countries/?per_page=1000&format=json' - with urlopen(url) as response: - data = response.read() - data = json.loads(data)[1] - data = pandas.DataFrame(data) - data.adminregion = [x['value'] for x in data.adminregion] - data.incomeLevel = [x['value'] for x in data.incomeLevel] - data.lendingType = [x['value'] for x in data.lendingType] - data.region = [x['value'] for x in data.region] - data = data.rename(columns={'id': 'iso3c', 'iso2Code': 'iso2c'}) - return data - -def get_indicators(): - """Download information about all World Bank data series - """ - url = 'http://api.worldbank.org/indicators?per_page=50000&format=json' - with urlopen(url) as response: - data = response.read() - data = json.loads(data)[1] - data = pandas.DataFrame(data) - # Clean fields - data.source = [x['value'] for x in data.source] - fun = lambda x: x.encode('ascii', 'ignore') - data.sourceOrganization = data.sourceOrganization.apply(fun) - # Clean topic field - - def get_value(x): - try: - return x['value'] - except: - return '' - fun = lambda x: [get_value(y) for y in x] - data.topics = data.topics.apply(fun) - data.topics = data.topics.apply(lambda x: ' ; '.join(x)) - # Clean outpu - data = data.sort(columns='id') - data.index = pandas.Index(lrange(data.shape[0])) - return data - -_cached_series = None - - -def search(string='gdp.*capi', field='name', case=False): - """ - Search available data series from the world bank - - Parameters - ---------- - - string: string - regular expression - field: string - id, name, source, sourceNote, sourceOrganization, topics - See notes below - case: bool - case sensitive search? - - Notes - ----- - - The first time this function is run it will download and cache the full - list of available series. Depending on the speed of your network - connection, this can take time. Subsequent searches will use the cached - copy, so they should be much faster. - - id : Data series indicator (for use with the ``indicator`` argument of - ``WDI()``) e.g. 
NY.GNS.ICTR.GN.ZS" - name: Short description of the data series - source: Data collection project - sourceOrganization: Data collection organization - note: - sourceNote: - topics: - """ - # Create cached list of series if it does not exist - global _cached_series - if type(_cached_series) is not pandas.core.frame.DataFrame: - _cached_series = get_indicators() - data = _cached_series[field] - idx = data.str.contains(string, case=case) - out = _cached_series.ix[idx].dropna() - return out +raise ImportError( + "The pandas.io.wb module is moved to a separate package " + "(pandas-datareader). After installing the pandas-datareader package " + "(https://github.com/pydata/pandas-datareader), you can change " + "the import ``from pandas.io import data, wb`` to " + "``from pandas_datareader import data, wb``.") diff --git a/pandas/lib.pxd b/pandas/lib.pxd index ba52e4cc47c89..554b0248e97ea 100644 --- a/pandas/lib.pxd +++ b/pandas/lib.pxd @@ -1 +1,4 @@ +# prototypes for sharing + cdef bint is_null_datetimelike(v) +cpdef bint is_period(val) diff --git a/pandas/lib.pyx b/pandas/lib.pyx index a9c7f93097f1b..e7672de5c835e 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -84,6 +84,7 @@ PyDateTime_IMPORT import_array() import_ufunc() + def values_from_object(object o): """ return my values or the object if we are say an ndarray """ cdef f @@ -159,6 +160,7 @@ def ismember(ndarray arr, set values): return result.view(np.bool_) + def ismember_int64(ndarray[int64_t] arr, set values): """ Checks whether @@ -184,6 +186,7 @@ def ismember_int64(ndarray[int64_t] arr, set values): return result.view(np.bool_) + @cython.wraparound(False) @cython.boundscheck(False) def memory_usage_of_objects(ndarray[object, ndim=1] arr): @@ -217,12 +220,15 @@ cdef inline int64_t gmtime(object date): days = pydate(y, m, 1).toordinal() - _EPOCH_ORD + d - 1 return (( (((days * 24 + h) * 60 + mn))) * 60 + s) * 1000 + cpdef object to_datetime(int64_t timestamp): return pydatetime.utcfromtimestamp(timestamp / 1000.0) + cpdef object to_timestamp(object dt): return gmtime(dt) + def array_to_timestamp(ndarray[object, ndim=1] arr): cdef int i, n cdef ndarray[int64_t, ndim=1] result @@ -235,6 +241,7 @@ def array_to_timestamp(ndarray[object, ndim=1] arr): return result + def time64_to_datetime(ndarray[int64_t, ndim=1] arr): cdef int i, n cdef ndarray[object, ndim=1] result @@ -254,6 +261,7 @@ def time64_to_datetime(ndarray[int64_t, ndim=1] arr): cdef double INF = np.inf cdef double NEGINF = -INF + cpdef checknull(object val): if util.is_float_object(val) or util.is_complex_object(val): return val != val # and val != INF and val != NEGINF @@ -268,6 +276,7 @@ cpdef checknull(object val): else: return _checknull(val) + cpdef checknull_old(object val): if util.is_float_object(val) or util.is_complex_object(val): return val != val or val == INF or val == NEGINF @@ -282,18 +291,21 @@ cpdef checknull_old(object val): else: return util._checknull(val) + cpdef isposinf_scalar(object val): if util.is_float_object(val) and val == INF: return True else: return False + cpdef isneginf_scalar(object val): if util.is_float_object(val) and val == NEGINF: return True else: return False + def isscalar(object val): """ Return True if given value is scalar. @@ -342,11 +354,13 @@ def item_from_zerodim(object val): @cython.wraparound(False) @cython.boundscheck(False) -def isnullobj(ndarray[object] arr): +def isnullobj(ndarray arr): cdef Py_ssize_t i, n cdef object val cdef ndarray[uint8_t] result + assert arr.ndim == 1, "'arr' must be 1-D." 
+ n = len(arr) result = np.empty(n, dtype=np.uint8) for i from 0 <= i < n: @@ -354,13 +368,16 @@ def isnullobj(ndarray[object] arr): result[i] = _check_all_nulls(val) return result.view(np.bool_) + @cython.wraparound(False) @cython.boundscheck(False) -def isnullobj_old(ndarray[object] arr): +def isnullobj_old(ndarray arr): cdef Py_ssize_t i, n cdef object val cdef ndarray[uint8_t] result + assert arr.ndim == 1, "'arr' must be 1-D." + n = len(arr) result = np.zeros(n, dtype=np.uint8) for i from 0 <= i < n: @@ -368,13 +385,16 @@ def isnullobj_old(ndarray[object] arr): result[i] = val is NaT or util._checknull_old(val) return result.view(np.bool_) + @cython.wraparound(False) @cython.boundscheck(False) -def isnullobj2d(ndarray[object, ndim=2] arr): +def isnullobj2d(ndarray arr): cdef Py_ssize_t i, j, n, m cdef object val cdef ndarray[uint8_t, ndim=2] result + assert arr.ndim == 2, "'arr' must be 2-D." + n, m = ( arr).shape result = np.zeros((n, m), dtype=np.uint8) for i from 0 <= i < n: @@ -384,13 +404,16 @@ def isnullobj2d(ndarray[object, ndim=2] arr): result[i, j] = 1 return result.view(np.bool_) + @cython.wraparound(False) @cython.boundscheck(False) -def isnullobj2d_old(ndarray[object, ndim=2] arr): +def isnullobj2d_old(ndarray arr): cdef Py_ssize_t i, j, n, m cdef object val cdef ndarray[uint8_t, ndim=2] result + assert arr.ndim == 2, "'arr' must be 2-D." + n, m = ( arr).shape result = np.zeros((n, m), dtype=np.uint8) for i from 0 <= i < n: @@ -405,8 +428,8 @@ def isnullobj2d_old(ndarray[object, ndim=2] arr): @cython.boundscheck(False) cpdef ndarray[object] list_to_object_array(list obj): """ - Convert list to object ndarray. Seriously can\'t believe I had to write this - function + Convert list to object ndarray. Seriously can\'t believe + I had to write this function. """ cdef: Py_ssize_t i, n = len(obj) @@ -439,6 +462,7 @@ def fast_unique(ndarray[object] values): return uniques + @cython.wraparound(False) @cython.boundscheck(False) def fast_unique_multiple(list arrays): @@ -465,6 +489,7 @@ def fast_unique_multiple(list arrays): return uniques + @cython.wraparound(False) @cython.boundscheck(False) def fast_unique_multiple_list(list lists): @@ -491,6 +516,7 @@ def fast_unique_multiple_list(list lists): return uniques + @cython.wraparound(False) @cython.boundscheck(False) def fast_unique_multiple_list_gen(object gen, bint sort=True): @@ -530,6 +556,7 @@ def fast_unique_multiple_list_gen(object gen, bint sort=True): return uniques + @cython.wraparound(False) @cython.boundscheck(False) def dicts_to_array(list dicts, list columns): @@ -555,6 +582,7 @@ def dicts_to_array(list dicts, list columns): return result + def fast_zip(list ndarrays): """ For zipping multiple ndarrays into an ndarray of tuples @@ -596,6 +624,7 @@ def fast_zip(list ndarrays): return result + def get_reverse_indexer(ndarray[int64_t] indexer, Py_ssize_t length): """ Reverse indexing operation. 
@@ -637,6 +666,7 @@ def has_infs_f4(ndarray[float32_t] arr): return True return False + def has_infs_f8(ndarray[float64_t] arr): cdef: Py_ssize_t i, n = len(arr) @@ -651,6 +681,7 @@ def has_infs_f8(ndarray[float64_t] arr): return True return False + def convert_timestamps(ndarray values): cdef: object val, f, result @@ -768,12 +799,12 @@ def scalar_compare(ndarray[object] values, object val, object op): raise ValueError('Unrecognized operator') result = np.empty(n, dtype=bool).view(np.uint8) - isnull_val = _checknull(val) + isnull_val = checknull(val) if flag == cpython.Py_NE: for i in range(n): x = values[i] - if _checknull(x): + if checknull(x): result[i] = True elif isnull_val: result[i] = True @@ -785,7 +816,7 @@ def scalar_compare(ndarray[object] values, object val, object op): elif flag == cpython.Py_EQ: for i in range(n): x = values[i] - if _checknull(x): + if checknull(x): result[i] = False elif isnull_val: result[i] = False @@ -798,7 +829,7 @@ def scalar_compare(ndarray[object] values, object val, object op): else: for i in range(n): x = values[i] - if _checknull(x): + if checknull(x): result[i] = False elif isnull_val: result[i] = False @@ -864,7 +895,7 @@ def vec_compare(ndarray[object] left, ndarray[object] right, object op): x = left[i] y = right[i] - if _checknull(x) or _checknull(y): + if checknull(x) or checknull(y): result[i] = True else: result[i] = cpython.PyObject_RichCompareBool(x, y, flag) @@ -873,7 +904,7 @@ def vec_compare(ndarray[object] left, ndarray[object] right, object op): x = left[i] y = right[i] - if _checknull(x) or _checknull(y): + if checknull(x) or checknull(y): result[i] = False else: result[i] = cpython.PyObject_RichCompareBool(x, y, flag) @@ -903,6 +934,7 @@ def scalar_binop(ndarray[object] values, object val, object op): return maybe_convert_bool(result) + @cython.wraparound(False) @cython.boundscheck(False) def vec_binop(ndarray[object] left, ndarray[object] right, object op): @@ -940,18 +972,19 @@ def astype_intsafe(ndarray[object] arr, new_dtype): ndarray result # on 32-bit, 1.6.2 numpy M8[ns] is a subdtype of integer, which is weird - is_datelike = new_dtype in ['M8[ns]','m8[ns]'] + is_datelike = new_dtype in ['M8[ns]', 'm8[ns]'] result = np.empty(n, dtype=new_dtype) for i in range(n): v = arr[i] if is_datelike and checknull(v): - result[i] = NPY_NAT + result[i] = NPY_NAT else: - util.set_value_at(result, i, v) + util.set_value_at(result, i, v) return result + cpdef ndarray[object] astype_unicode(ndarray arr): cdef: Py_ssize_t i, n = arr.size @@ -962,6 +995,7 @@ cpdef ndarray[object] astype_unicode(ndarray arr): return result + cpdef ndarray[object] astype_str(ndarray arr): cdef: Py_ssize_t i, n = arr.size @@ -972,6 +1006,7 @@ cpdef ndarray[object] astype_str(ndarray arr): return result + def clean_index_list(list obj): """ Utility used in pandas.core.index._ensure_index @@ -984,7 +1019,7 @@ def clean_index_list(list obj): for i in range(n): v = obj[i] - if not (PyList_Check(v) or np.PyArray_Check(v) or hasattr(v,'_data')): + if not (PyList_Check(v) or np.PyArray_Check(v) or hasattr(v, '_data')): all_arrays = 0 break @@ -994,7 +1029,7 @@ def clean_index_list(list obj): converted = np.empty(n, dtype=object) for i in range(n): v = obj[i] - if PyList_Check(v) or np.PyArray_Check(v) or hasattr(v,'_data'): + if PyList_Check(v) or np.PyArray_Check(v) or hasattr(v, '_data'): converted[i] = tuple(v) else: converted[i] = v @@ -1030,10 +1065,16 @@ cpdef Py_ssize_t max_len_string_array(pandas_string[:] arr): return m + @cython.boundscheck(False) 
@cython.wraparound(False) -def string_array_replace_from_nan_rep(ndarray[object, ndim=1] arr, object nan_rep, object replace = None): - """ replace the values in the array with replacement if they are nan_rep; return the same array """ +def string_array_replace_from_nan_rep( + ndarray[object, ndim=1] arr, object nan_rep, + object replace=None): + """ + Replace the values in the array with 'replacement' if + they are 'nan_rep'. Return the same array. + """ cdef int length = arr.shape[0], i = 0 if replace is None: @@ -1045,9 +1086,11 @@ def string_array_replace_from_nan_rep(ndarray[object, ndim=1] arr, object nan_re return arr + @cython.boundscheck(False) @cython.wraparound(False) -def write_csv_rows(list data, ndarray data_index, int nlevels, ndarray cols, object writer): +def write_csv_rows(list data, ndarray data_index, + int nlevels, ndarray cols, object writer): cdef int N, j, i, ncols cdef list rows @@ -1058,7 +1101,7 @@ def write_csv_rows(list data, ndarray data_index, int nlevels, ndarray cols, obj # pre-allocate rows ncols = len(cols) - rows = [[None]*(nlevels+ncols) for x in range(N)] + rows = [[None] * (nlevels + ncols) for x in range(N)] j = -1 if nlevels == 1: @@ -1066,18 +1109,18 @@ def write_csv_rows(list data, ndarray data_index, int nlevels, ndarray cols, obj row = rows[j % N] row[0] = data_index[j] for i in range(ncols): - row[1+i] = data[i][j] + row[1 + i] = data[i][j] - if j >= N-1 and j % N == N-1: + if j >= N - 1 and j % N == N - 1: writer.writerows(rows) elif nlevels > 1: for j in range(len(data_index)): row = rows[j % N] row[:nlevels] = list(data_index[j]) for i in range(ncols): - row[nlevels+i] = data[i][j] + row[nlevels + i] = data[i][j] - if j >= N-1 and j % N == N-1: + if j >= N - 1 and j % N == N - 1: writer.writerows(rows) else: for j in range(len(data_index)): @@ -1085,15 +1128,15 @@ def write_csv_rows(list data, ndarray data_index, int nlevels, ndarray cols, obj for i in range(ncols): row[i] = data[i][j] - if j >= N-1 and j % N == N-1: + if j >= N - 1 and j % N == N - 1: writer.writerows(rows) - if j >= 0 and (j < N-1 or (j % N) != N-1 ): - writer.writerows(rows[:((j+1) % N)]) + if j >= 0 and (j < N - 1 or (j % N) != N - 1): + writer.writerows(rows[:((j + 1) % N)]) -#------------------------------------------------------------------------------- -# Groupby-related functions +#------------------------------------------------------------------------------ +# Groupby-related functions @cython.boundscheck(False) def arrmap(ndarray[object] index, object func): cdef int length = index.shape[0] @@ -1106,6 +1149,7 @@ def arrmap(ndarray[object] index, object func): return result + @cython.wraparound(False) @cython.boundscheck(False) def is_lexsorted(list list_of_arrays): @@ -1120,16 +1164,14 @@ def is_lexsorted(list list_of_arrays): cdef int64_t **vecs = malloc(nlevels * sizeof(int64_t*)) for i from 0 <= i < nlevels: - # vecs[i] = ( list_of_arrays[i]).data - arr = list_of_arrays[i] vecs[i] = arr.data - # assume uniqueness?? + # Assume uniqueness?? for i from 1 <= i < n: for k from 0 <= k < nlevels: cur = vecs[k][i] - pre = vecs[k][i-1] + pre = vecs[k][i - 1] if cur == pre: continue elif cur > pre: @@ -1140,11 +1182,9 @@ def is_lexsorted(list list_of_arrays): return True - # TODO: could do even better if we know something about the data. eg, index has # 1-min data, binner has 5-min data, then bins are just strides in index. This # is a general, O(max(len(values), len(binner))) method. 
- @cython.boundscheck(False) @cython.wraparound(False) def generate_bins_dt64(ndarray[int64_t] values, ndarray[int64_t] binner, @@ -1174,18 +1214,18 @@ def generate_bins_dt64(ndarray[int64_t] values, ndarray[int64_t] binner, if values[0] < binner[0]: raise ValueError("Values falls before first bin") - if values[lenidx-1] > binner[lenbin-1]: + if values[lenidx - 1] > binner[lenbin - 1]: raise ValueError("Values falls after last bin") bins = np.empty(lenbin - 1, dtype=np.int64) - j = 0 # index into values + j = 0 # index into values bc = 0 # bin count # linear scan if right_closed: for i in range(0, lenbin - 1): - r_bin = binner[i+1] + r_bin = binner[i + 1] # count values in current bin, advance to next bin while j < lenidx and values[j] <= r_bin: j += 1 @@ -1193,7 +1233,7 @@ def generate_bins_dt64(ndarray[int64_t] values, ndarray[int64_t] binner, bc += 1 else: for i in range(0, lenbin - 1): - r_bin = binner[i+1] + r_bin = binner[i + 1] # count values in current bin, advance to next bin while j < lenidx and values[j] < r_bin: j += 1 @@ -1208,8 +1248,6 @@ def generate_bins_dt64(ndarray[int64_t] values, ndarray[int64_t] binner, return bins - - @cython.boundscheck(False) @cython.wraparound(False) def row_bool_subset(ndarray[float64_t, ndim=2] values, @@ -1231,6 +1269,7 @@ def row_bool_subset(ndarray[float64_t, ndim=2] values, return out + @cython.boundscheck(False) @cython.wraparound(False) def row_bool_subset_object(ndarray[object, ndim=2] values, @@ -1252,6 +1291,7 @@ def row_bool_subset_object(ndarray[object, ndim=2] values, return out + @cython.boundscheck(False) @cython.wraparound(False) def get_level_sorter(ndarray[int64_t, ndim=1] label, @@ -1274,6 +1314,7 @@ def get_level_sorter(ndarray[int64_t, ndim=1] label, return out + def group_count(ndarray[int64_t] values, Py_ssize_t size): cdef: Py_ssize_t i, n = len(values) @@ -1284,6 +1325,7 @@ def group_count(ndarray[int64_t] values, Py_ssize_t size): counts[values[i]] += 1 return counts + def lookup_values(ndarray[object] values, dict mapping): cdef: Py_ssize_t i, n = len(values) @@ -1323,6 +1365,7 @@ def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask, return counts + cdef class _PandasNull: def __richcmp__(_PandasNull self, object other, int op): @@ -1338,6 +1381,7 @@ cdef class _PandasNull: pandas_null = _PandasNull() + def fast_zip_fillna(list ndarrays, fill_value=pandas_null): """ For zipping multiple ndarrays into an ndarray of tuples @@ -1386,46 +1430,6 @@ def fast_zip_fillna(list ndarrays, fill_value=pandas_null): return result -def duplicated(ndarray[object] values, object keep='first'): - cdef: - Py_ssize_t i, n - dict seen = dict() - object row - - n = len(values) - cdef ndarray[uint8_t] result = np.zeros(n, dtype=np.uint8) - - if keep == 'last': - for i from n > i >= 0: - row = values[i] - if row in seen: - result[i] = 1 - else: - seen[row] = i - result[i] = 0 - elif keep == 'first': - for i from 0 <= i < n: - row = values[i] - if row in seen: - result[i] = 1 - else: - seen[row] = i - result[i] = 0 - elif keep is False: - for i from 0 <= i < n: - row = values[i] - if row in seen: - result[i] = 1 - result[seen[row]] = 1 - else: - seen[row] = i - result[i] = 0 - else: - raise ValueError('keep must be either "first", "last" or False') - - return result.view(np.bool_) - - def generate_slices(ndarray[int64_t] labels, Py_ssize_t ngroups): cdef: Py_ssize_t i, group_size, n, start @@ -1477,7 +1481,7 @@ def indices_fast(object index, ndarray[int64_t] labels, list keys, tup = PyTuple_New(k) for j in range(k): val = 
util.get_value_at(keys[j], - sorted_labels[j][i-1]) + sorted_labels[j][i - 1]) PyTuple_SET_ITEM(tup, j, val) Py_INCREF(val) @@ -1606,7 +1610,7 @@ cpdef slice indexer_as_slice(int64_t[:] vals): return None for i in range(2, n): - if vals[i] < 0 or vals[i] - vals[i-1] != d: + if vals[i] < 0 or vals[i] - vals[i - 1] != d: return None start = vals[0] @@ -1677,12 +1681,13 @@ cpdef slice_get_indices_ex(slice slc, Py_ssize_t objlen=PY_SSIZE_T_MAX): if slc is None: raise TypeError("slc should be a slice") - PySlice_GetIndicesEx(slc, objlen, + PySlice_GetIndicesEx(slc, objlen, &start, &stop, &step, &length) return start, stop, step, length -cpdef Py_ssize_t slice_len(slice slc, Py_ssize_t objlen=PY_SSIZE_T_MAX) except -1: +cpdef Py_ssize_t slice_len( + slice slc, Py_ssize_t objlen=PY_SSIZE_T_MAX) except -1: """ Get length of a bounded slice. @@ -1700,7 +1705,7 @@ cpdef Py_ssize_t slice_len(slice slc, Py_ssize_t objlen=PY_SSIZE_T_MAX) except - if slc is None: raise TypeError("slc must be slice") - PySlice_GetIndicesEx(slc, objlen, + PySlice_GetIndicesEx(slc, objlen, &start, &stop, &step, &length) return length diff --git a/pandas/msgpack/_packer.pyx b/pandas/msgpack/_packer.pyx index 5004b9e8e7262..008dbe5541d50 100644 --- a/pandas/msgpack/_packer.pyx +++ b/pandas/msgpack/_packer.pyx @@ -23,7 +23,8 @@ cdef extern from "../src/msgpack/pack.h": int msgpack_pack_false(msgpack_packer* pk) int msgpack_pack_long(msgpack_packer* pk, long d) int msgpack_pack_long_long(msgpack_packer* pk, long long d) - int msgpack_pack_unsigned_long_long(msgpack_packer* pk, unsigned long long d) + int msgpack_pack_unsigned_long_long(msgpack_packer* pk, + unsigned long long d) int msgpack_pack_float(msgpack_packer* pk, float d) int msgpack_pack_double(msgpack_packer* pk, double d) int msgpack_pack_array(msgpack_packer* pk, size_t l) @@ -58,8 +59,10 @@ cdef class Packer(object): :param bool use_single_float: Use single precision float type for float. (default: False) :param bool autoreset: - Reset buffer after each pack and return it's content as `bytes`. (default: True). - If set this to false, use `bytes()` to get content and `.reset()` to clear buffer. + Reset buffer after each pack and return its + content as `bytes`. (default: True). + If this is set to false, use `bytes()` to get + content and `.reset()` to clear buffer. :param bool use_bin_type: Use bin type introduced in msgpack spec 2.0 for bytes. It also enable str8 type for unicode.
@@ -74,15 +77,16 @@ cdef class Packer(object): cdef bint autoreset def __cinit__(self): - cdef int buf_size = 1024*1024 - self.pk.buf = malloc(buf_size); + cdef int buf_size = 1024 * 1024 + self.pk.buf = malloc(buf_size) if self.pk.buf == NULL: raise MemoryError("Unable to allocate internal buffer.") self.pk.buf_size = buf_size self.pk.length = 0 - def __init__(self, default=None, encoding='utf-8', unicode_errors='strict', - use_single_float=False, bint autoreset=1, bint use_bin_type=0): + def __init__(self, default=None, encoding='utf-8', + unicode_errors='strict', use_single_float=False, + bint autoreset=1, bint use_bin_type=0): """ """ self.use_float = use_single_float @@ -110,7 +114,8 @@ cdef class Packer(object): def __dealloc__(self): free(self.pk.buf); - cdef int _pack(self, object o, int nest_limit=DEFAULT_RECURSE_LIMIT) except -1: + cdef int _pack(self, object o, + int nest_limit=DEFAULT_RECURSE_LIMIT) except -1: cdef long long llval cdef unsigned long long ullval cdef long longval @@ -147,14 +152,14 @@ cdef class Packer(object): ret = msgpack_pack_long(&self.pk, longval) elif PyFloat_Check(o): if self.use_float: - fval = o - ret = msgpack_pack_float(&self.pk, fval) + fval = o + ret = msgpack_pack_float(&self.pk, fval) else: - dval = o - ret = msgpack_pack_double(&self.pk, dval) + dval = o + ret = msgpack_pack_double(&self.pk, dval) elif PyBytes_Check(o): L = len(o) - if L > (2**32)-1: + if L > (2**32) - 1: raise ValueError("bytes is too large") rawval = o ret = msgpack_pack_bin(&self.pk, L) @@ -162,10 +167,12 @@ cdef class Packer(object): ret = msgpack_pack_raw_body(&self.pk, rawval, L) elif PyUnicode_Check(o): if not self.encoding: - raise TypeError("Can't encode unicode string: no encoding is specified") - o = PyUnicode_AsEncodedString(o, self.encoding, self.unicode_errors) + raise TypeError("Can't encode unicode string: " + "no encoding is specified") + o = PyUnicode_AsEncodedString(o, self.encoding, + self.unicode_errors) L = len(o) - if L > (2**32)-1: + if L > (2**32) - 1: raise ValueError("dict is too large") rawval = o ret = msgpack_pack_raw(&self.pk, len(o)) @@ -174,43 +181,43 @@ cdef class Packer(object): elif PyDict_CheckExact(o): d = o L = len(d) - if L > (2**32)-1: + if L > (2**32) - 1: raise ValueError("dict is too large") ret = msgpack_pack_map(&self.pk, L) if ret == 0: for k, v in d.iteritems(): - ret = self._pack(k, nest_limit-1) + ret = self._pack(k, nest_limit - 1) if ret != 0: break - ret = self._pack(v, nest_limit-1) + ret = self._pack(v, nest_limit - 1) if ret != 0: break elif PyDict_Check(o): L = len(o) - if L > (2**32)-1: + if L > (2**32) - 1: raise ValueError("dict is too large") ret = msgpack_pack_map(&self.pk, L) if ret == 0: for k, v in o.items(): - ret = self._pack(k, nest_limit-1) + ret = self._pack(k, nest_limit - 1) if ret != 0: break - ret = self._pack(v, nest_limit-1) + ret = self._pack(v, nest_limit - 1) if ret != 0: break elif isinstance(o, ExtType): # This should be before Tuple because ExtType is namedtuple. 
longval = o.code rawval = o.data L = len(o.data) - if L > (2**32)-1: + if L > (2**32) - 1: raise ValueError("EXT data is too large") ret = msgpack_pack_ext(&self.pk, longval, L) ret = msgpack_pack_raw_body(&self.pk, rawval, L) elif PyTuple_Check(o) or PyList_Check(o): L = len(o) - if L > (2**32)-1: + if L > (2**32) - 1: raise ValueError("list is too large") ret = msgpack_pack_array(&self.pk, L) if ret == 0: for v in o: - ret = self._pack(v, nest_limit-1) + ret = self._pack(v, nest_limit - 1) if ret != 0: break elif not default_used and self._default: o = self._default(o) @@ -237,7 +244,7 @@ cdef class Packer(object): msgpack_pack_raw_body(&self.pk, data, len(data)) def pack_array_header(self, size_t size): - if size > (2**32-1): + if size > (2**32) - 1: raise ValueError cdef int ret = msgpack_pack_array(&self.pk, size) if ret == -1: @@ -250,7 +257,7 @@ cdef class Packer(object): return buf def pack_map_header(self, size_t size): - if size > (2**32-1): + if size > (2**32) - 1: raise ValueError cdef int ret = msgpack_pack_map(&self.pk, size) if ret == -1: diff --git a/pandas/msgpack/_unpacker.pyx b/pandas/msgpack/_unpacker.pyx index f68bf3369427c..6f23a24adde6c 100644 --- a/pandas/msgpack/_unpacker.pyx +++ b/pandas/msgpack/_unpacker.pyx @@ -4,18 +4,15 @@ from cpython cimport * cdef extern from "Python.h": ctypedef struct PyObject - cdef int PyObject_AsReadBuffer(object o, const void** buff, Py_ssize_t* buf_len) except -1 + cdef int PyObject_AsReadBuffer(object o, const void** buff, + Py_ssize_t* buf_len) except -1 from libc.stdlib cimport * from libc.string cimport * from libc.limits cimport * -from pandas.msgpack.exceptions import ( - BufferFull, - OutOfData, - UnpackValueError, - ExtraData, - ) +from pandas.msgpack.exceptions import (BufferFull, OutOfData, + UnpackValueError, ExtraData) from pandas.msgpack import ExtType @@ -65,7 +62,8 @@ cdef inline init_ctx(unpack_context *ctx, ctx.user.max_ext_len = max_ext_len if object_hook is not None and object_pairs_hook is not None: - raise TypeError("object_pairs_hook and object_hook are mutually exclusive.") + raise TypeError("object_pairs_hook and object_hook " + "are mutually exclusive.") if object_hook is not None: if not PyCallable_Check(object_hook): @@ -93,8 +91,11 @@ cdef inline init_ctx(unpack_context *ctx, ctx.user.encoding = encoding ctx.user.unicode_errors = unicode_errors + def default_read_extended_type(typecode, data): - raise NotImplementedError("Cannot decode extended type with typecode=%d" % typecode) + raise NotImplementedError("Cannot decode extended type " + "with typecode=%d" % typecode) + def unpackb(object packed, object object_hook=None, object list_hook=None, bint use_list=1, encoding=None, unicode_errors="strict", @@ -139,7 +140,8 @@ def unpackb(object packed, object object_hook=None, object list_hook=None, if ret == 1: obj = unpack_data(&ctx) if off < buf_len: - raise ExtraData(obj, PyBytes_FromStringAndSize(buf+off, buf_len-off)) + raise ExtraData(obj, PyBytes_FromStringAndSize( + buf + off, buf_len - off)) return obj else: raise UnpackValueError("Unpack failed: error = %d" % (ret,)) @@ -157,9 +159,9 @@ def unpack(object stream, object object_hook=None, object list_hook=None, See :class:`Unpacker` for options. 
""" return unpackb(stream.read(), use_list=use_list, - object_hook=object_hook, object_pairs_hook=object_pairs_hook, list_hook=list_hook, - encoding=encoding, unicode_errors=unicode_errors, - ) + object_hook=object_hook, + object_pairs_hook=object_pairs_hook, list_hook=list_hook, + encoding=encoding, unicode_errors=unicode_errors) cdef class Unpacker(object): @@ -169,10 +171,12 @@ cdef class Unpacker(object): :param file_like: File-like object having `.read(n)` method. - If specified, unpacker reads serialized data from it and :meth:`feed()` is not usable. + If specified, unpacker reads serialized data from it and + :meth:`feed()` is not usable. :param int read_size: - Used as `file_like.read(read_size)`. (default: `min(1024**2, max_buffer_size)`) + Used as `file_like.read(read_size)`. (default: + `min(1024**2, max_buffer_size)`) :param bool use_list: If true, unpack msgpack array to Python list. @@ -184,9 +188,8 @@ cdef class Unpacker(object): (See also simplejson) :param callable object_pairs_hook: - When specified, it should be callable. - Unpacker calls it with a list of key-value pairs after unpacking msgpack map. - (See also simplejson) + When specified, it should be callable. Unpacker calls it with a list + of key-value pairs after unpacking msgpack map. (See also simplejson) :param str encoding: Encoding used for decoding msgpack raw. @@ -197,9 +200,10 @@ cdef class Unpacker(object): (default: `'strict'`) :param int max_buffer_size: - Limits size of data waiting unpacked. 0 means system's INT_MAX (default). - Raises `BufferFull` exception when it is insufficient. - You shoud set this parameter when unpacking data from untrasted source. + Limits size of data waiting unpacked. 0 means system's + INT_MAX (default). Raises `BufferFull` exception when it + is insufficient. You shoud set this parameter when unpacking + data from untrasted source. :param int max_str_len: Limits max length of str. (default: 2**31-1) @@ -250,9 +254,9 @@ cdef class Unpacker(object): self.buf = NULL def __init__(self, file_like=None, Py_ssize_t read_size=0, bint use_list=1, - object object_hook=None, object object_pairs_hook=None, object list_hook=None, - encoding=None, unicode_errors='strict', int max_buffer_size=0, - object ext_hook=ExtType, + object object_hook=None, object object_pairs_hook=None, + object list_hook=None, encoding=None, unicode_errors='strict', + int max_buffer_size=0, object ext_hook=ExtType, Py_ssize_t max_str_len=2147483647, # 2**32-1 Py_ssize_t max_bin_len=2147483647, Py_ssize_t max_array_len=2147483647, @@ -274,7 +278,8 @@ cdef class Unpacker(object): if not max_buffer_size: max_buffer_size = INT_MAX if read_size > max_buffer_size: - raise ValueError("read_size should be less or equal to max_buffer_size") + raise ValueError("read_size should be less or " + "equal to max_buffer_size") if not read_size: read_size = min(max_buffer_size, 1024**2) self.max_buffer_size = max_buffer_size @@ -313,8 +318,8 @@ cdef class Unpacker(object): """Append `next_bytes` to internal buffer.""" cdef Py_buffer pybuff if self.file_like is not None: - raise AssertionError( - "unpacker.feed() is not be able to use with `file_like`.") + raise AssertionError("unpacker.feed() is not be able " + "to use with `file_like`.") PyObject_GetBuffer(next_bytes, &pybuff, PyBUF_SIMPLE) try: self.append_buffer(pybuff.buf, pybuff.len) @@ -338,10 +343,10 @@ cdef class Unpacker(object): head = 0 else: # expand buffer. 
- new_size = (tail-head) + _buf_len + new_size = (tail - head) + _buf_len if new_size > self.max_buffer_size: raise BufferFull - new_size = min(new_size*2, self.max_buffer_size) + new_size = min(new_size * 2, self.max_buffer_size) new_buf = malloc(new_size) if new_buf == NULL: # self.buf still holds old buffer and will be freed during @@ -363,15 +368,16 @@ cdef class Unpacker(object): cdef read_from_file(self): next_bytes = self.file_like_read( - min(self.read_size, - self.max_buffer_size - (self.buf_tail - self.buf_head) - )) + min(self.read_size, + self.max_buffer_size - (self.buf_tail - self.buf_head))) if next_bytes: - self.append_buffer(PyBytes_AsString(next_bytes), PyBytes_Size(next_bytes)) + self.append_buffer(PyBytes_AsString(next_bytes), + PyBytes_Size(next_bytes)) else: self.file_like = None - cdef object _unpack(self, execute_fn execute, object write_bytes, bint iter=0): + cdef object _unpack(self, execute_fn execute, + object write_bytes, bint iter=0): cdef int ret cdef object obj cdef size_t prev_head @@ -389,7 +395,8 @@ cdef class Unpacker(object): ret = execute(&self.ctx, self.buf, self.buf_tail, &self.buf_head) if write_bytes is not None: - write_bytes(PyBytes_FromStringAndSize(self.buf + prev_head, self.buf_head - prev_head)) + write_bytes(PyBytes_FromStringAndSize( + self.buf + prev_head, self.buf_head - prev_head)) if ret == 1: obj = unpack_data(&self.ctx) diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 729e5af528b80..12525c7a9c587 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -7,9 +7,12 @@ from libc.string cimport strncpy, strlen, strcmp, strcasecmp cimport libc.stdio as stdio import warnings +from csv import QUOTE_MINIMAL, QUOTE_NONNUMERIC, QUOTE_NONE from cpython cimport (PyObject, PyBytes_FromString, PyBytes_AsString, PyBytes_Check, - PyUnicode_Check, PyUnicode_AsUTF8String) + PyUnicode_Check, PyUnicode_AsUTF8String, + PyErr_Occurred, PyErr_Fetch) +from cpython.ref cimport PyObject, Py_XDECREF from io.common import CParserError, DtypeWarning, EmptyDataError @@ -22,6 +25,7 @@ cdef extern from "Python.h": cdef extern from "stdlib.h": void memcpy(void *dst, void *src, size_t n) +cimport cython cimport numpy as cnp from numpy cimport ndarray, uint8_t, uint64_t @@ -30,6 +34,16 @@ import numpy as np cimport util import pandas.lib as lib +import pandas.compat as compat +from pandas.types.common import (is_categorical_dtype, CategoricalDtype, + is_integer_dtype, is_float_dtype, + is_bool_dtype, is_object_dtype, + is_string_dtype, is_datetime64_dtype, + pandas_dtype) +from pandas.core.categorical import Categorical +from pandas.core.algorithms import take_1d +from pandas.types.concat import union_categoricals +from pandas import Index import time import os @@ -92,7 +106,7 @@ cdef extern from "parser/tokenizer.h": enum: ERROR_OVERFLOW ctypedef void* (*io_callback)(void *src, size_t nbytes, size_t *bytes_read, - int *status) + int *status) ctypedef int (*io_cleanup)(void *src) ctypedef struct parser_t: @@ -162,7 +176,7 @@ cdef extern from "parser/tokenizer.h": void *skipset int64_t skip_first_N_rows - int skip_footer + int skipfooter double (*converter)(const char *, char **, char, char, char, int) nogil # error handling @@ -267,7 +281,7 @@ cdef class TextReader: kh_str_t *true_set cdef public: - int leading_cols, table_width, skip_footer, buffer_lines + int leading_cols, table_width, skipfooter, buffer_lines object allow_leading_cols object delimiter, converters, delim_whitespace object na_values @@ -283,6 +297,7 @@ cdef class TextReader: object 
compression object mangle_dupe_cols object tupleize_cols + list dtype_cast_order set noconvert, usecols def __cinit__(self, source, @@ -334,7 +349,7 @@ cdef class TextReader: low_memory=False, buffer_lines=None, skiprows=None, - skip_footer=0, + skipfooter=0, verbose=False, mangle_dupe_cols=True, tupleize_cols=False, @@ -393,8 +408,13 @@ cdef class TextReader: raise ValueError('Only length-1 escapes supported') self.parser.escapechar = ord(escapechar) - self.parser.quotechar = ord(quotechar) - self.parser.quoting = quoting + self._set_quoting(quotechar, quoting) + + dtype_order = ['int64', 'float64', 'bool', 'object'] + if quoting == QUOTE_NONNUMERIC: + # consistent with csv module semantics, cast all to float + dtype_order = dtype_order[1:] + self.dtype_cast_order = [np.dtype(x) for x in dtype_order] if comment is not None: if len(comment) > 1: @@ -409,7 +429,7 @@ cdef class TextReader: if skiprows is not None: self._make_skiprow_set() - self.skip_footer = skip_footer + self.skipfooter = skipfooter # suboptimal if usecols is not None: @@ -417,7 +437,7 @@ cdef class TextReader: self.usecols = set(usecols) # XXX - if skip_footer > 0: + if skipfooter > 0: self.parser.error_bad_lines = 0 self.parser.warn_bad_lines = 0 @@ -463,15 +483,10 @@ cdef class TextReader: self.encoding = encoding if isinstance(dtype, dict): - conv = {} - for k in dtype: - v = dtype[k] - if isinstance(v, basestring): - v = np.dtype(v) - conv[k] = v - dtype = conv + dtype = {k: pandas_dtype(dtype[k]) + for k in dtype} elif dtype is not None: - dtype = np.dtype(dtype) + dtype = pandas_dtype(dtype) self.dtype = dtype @@ -501,7 +516,7 @@ cdef class TextReader: # need to artifically skip the final line # which is still a header line header = list(header) - header.append(header[-1]+1) + header.append(header[-1] + 1) self.parser.header_start = header[0] self.parser.header_end = header[-1] @@ -548,6 +563,29 @@ cdef class TextReader: def set_error_bad_lines(self, int status): self.parser.error_bad_lines = status + def _set_quoting(self, quote_char, quoting): + if not isinstance(quoting, int): + raise TypeError('"quoting" must be an integer') + + if not QUOTE_MINIMAL <= quoting <= QUOTE_NONE: + raise TypeError('bad "quoting" value') + + if not isinstance(quote_char, (str, bytes)) and quote_char is not None: + dtype = type(quote_char).__name__ + raise TypeError('"quotechar" must be string, ' + 'not {dtype}'.format(dtype=dtype)) + + if quote_char is None or quote_char == '': + if quoting != QUOTE_NONE: + raise TypeError("quotechar must be set if quoting enabled") + self.parser.quoting = quoting + self.parser.quotechar = -1 + elif len(quote_char) > 1: # 0-len case handled earlier + raise TypeError('"quotechar" must be a 1-character string') + else: + self.parser.quoting = quoting + self.parser.quotechar = ord(quote_char) + cdef _make_skiprow_set(self): if isinstance(self.skiprows, (int, np.integer)): parser_set_skipfirstnrows(self.parser, self.skiprows) @@ -593,7 +631,6 @@ cdef class TextReader: raise ValueError('Multiple files found in compressed ' 'zip file %s', str(zip_names)) elif self.compression == 'xz': - from pandas import compat lzma = compat.import_lzma() if isinstance(source, basestring): @@ -625,7 +662,8 @@ cdef class TextReader: if ptr == NULL: if not os.path.exists(source): - raise IOError('File %s does not exist' % source) + raise compat.FileNotFoundError( + 'File %s does not exist' % source) raise IOError('Initializing from file failed') self.parser.source = ptr @@ -651,12 +689,13 @@ cdef class TextReader: # header 
is now a list of lists, so field_count should use header[0] cdef: - size_t i, start, data_line, field_count, passed_count, hr, unnamed_count + size_t i, start, data_line, field_count, passed_count, hr, unnamed_count # noqa char *word object name int status Py_ssize_t size char *errors = "strict" + cdef StringPath path = _string_path(self.c_encoding) header = [] @@ -677,39 +716,40 @@ cdef class TextReader: # e.g., if header=3 and file only has 2 lines elif self.parser.lines < hr + 1: msg = self.orig_header - if isinstance(msg,list): - msg = "[%s], len of %d," % (','.join([ str(m) for m in msg ]),len(msg)) - raise CParserError('Passed header=%s but only %d lines in file' - % (msg, self.parser.lines)) + if isinstance(msg, list): + msg = "[%s], len of %d," % ( + ','.join([ str(m) for m in msg ]), len(msg)) + raise CParserError( + 'Passed header=%s but only %d lines in file' + % (msg, self.parser.lines)) else: field_count = self.parser.line_fields[hr] start = self.parser.line_start[hr] - # TODO: Py3 vs. Py2 counts = {} unnamed_count = 0 for i in range(field_count): word = self.parser.words[start + i] - if self.c_encoding == NULL and not PY3: + if path == CSTRING: name = PyBytes_FromString(word) - else: - if self.c_encoding == NULL or self.c_encoding == b'utf-8': - name = PyUnicode_FromString(word) - else: - name = PyUnicode_Decode(word, strlen(word), - self.c_encoding, errors) + elif path == UTF8: + name = PyUnicode_FromString(word) + elif path == ENCODED: + name = PyUnicode_Decode(word, strlen(word), + self.c_encoding, errors) if name == '': if self.has_mi_columns: - name = 'Unnamed: %d_level_%d' % (i,level) + name = 'Unnamed: %d_level_%d' % (i, level) else: name = 'Unnamed: %d' % i unnamed_count += 1 count = counts.get(name, 0) - if count > 0 and self.mangle_dupe_cols and not self.has_mi_columns: + if (count > 0 and self.mangle_dupe_cols + and not self.has_mi_columns): this_header.append('%s.%d' % (name, count)) else: this_header.append(name) @@ -717,12 +757,13 @@ cdef class TextReader: if self.has_mi_columns: - # if we have grabbed an extra line, but its not in our format - # so save in the buffer, and create an blank extra line for the rest of the - # parsing code + # If we have grabbed an extra line, but it's not in our + # format, save in the buffer, and create an blank extra + # line for the rest of the parsing code. 
if hr == self.header[-1]: lc = len(this_header) - ic = len(self.index_col) if self.index_col is not None else 0 + ic = (len(self.index_col) if self.index_col + is not None else 0) if lc != unnamed_count and lc - ic > unnamed_count: hr -= 1 self.parser_start -= 1 @@ -809,7 +850,7 @@ cdef class TextReader: if self.as_recarray: self._start_clock() - result = _to_structured_array(columns, self.header) + result = _to_structured_array(columns, self.header, self.usecols) self._end_clock('Conversion to structured array') return result @@ -880,8 +921,8 @@ cdef class TextReader: if buffered_lines < irows: self._tokenize_rows(irows - buffered_lines) - if self.skip_footer > 0: - raise ValueError('skip_footer can only be used to read ' + if self.skipfooter > 0: + raise ValueError('skipfooter can only be used to read ' 'the whole file') else: with nogil: @@ -894,7 +935,7 @@ cdef class TextReader: if status < 0: raise_parser_error('Error tokenizing data', self.parser) - footer = self.skip_footer + footer = self.skipfooter if self.parser_start == self.parser.lines: raise StopIteration @@ -956,20 +997,15 @@ cdef class TextReader: # if footer > 0: # end -= footer - #print >> sys.stderr, self.table_width - #print >> sys.stderr, self.leading_cols - #print >> sys.stderr, self.parser.lines - #print >> sys.stderr, start - #print >> sys.stderr, end - #print >> sys.stderr, self.header - #print >> sys.stderr, "index" num_cols = -1 for i in range(self.parser.lines): - num_cols = (num_cols < self.parser.line_fields[i]) * self.parser.line_fields[i] +\ + num_cols = (num_cols < self.parser.line_fields[i]) * \ + self.parser.line_fields[i] + \ (num_cols >= self.parser.line_fields[i]) * num_cols if self.table_width - self.leading_cols > num_cols: - raise CParserError("Too many columns specified: expected %s and found %s" % + raise CParserError( + "Too many columns specified: expected %s and found %s" % (self.table_width - self.leading_cols, num_cols)) results = {} @@ -1008,8 +1044,8 @@ cdef class TextReader: continue # Should return as the desired dtype (inferred or specified) - col_res, na_count = self._convert_tokens(i, start, end, name, - na_filter, na_hashset, na_flist) + col_res, na_count = self._convert_tokens( + i, start, end, name, na_filter, na_hashset, na_flist) if na_filter: self._free_na_set(na_hashset) @@ -1017,8 +1053,10 @@ cdef class TextReader: if upcast_na and na_count > 0: col_res = _maybe_upcast(col_res) - if issubclass(col_res.dtype.type, np.integer) and self.compact_ints: - col_res = downcast_int64(col_res, self.use_unsigned) + if issubclass(col_res.dtype.type, + np.integer) and self.compact_ints: + col_res = lib.downcast_int64(col_res, na_values, + self.use_unsigned) if col_res is None: raise CParserError('Unable to parse column %d' % i) @@ -1044,21 +1082,18 @@ cdef class TextReader: col_dtype = self.dtype[i] else: if self.dtype.names: - col_dtype = self.dtype.descr[i][1] + # structured array + col_dtype = np.dtype(self.dtype.descr[i][1]) else: col_dtype = self.dtype if col_dtype is not None: - if not isinstance(col_dtype, basestring): - if isinstance(col_dtype, np.dtype): - col_dtype = col_dtype.str - else: - col_dtype = np.dtype(col_dtype).str + col_res, na_count = self._convert_with_dtype( + col_dtype, i, start, end, na_filter, + 1, na_hashset, na_flist) - col_res, na_count = self._convert_with_dtype(col_dtype, i, start, end, - na_filter, 1, na_hashset, na_flist) - - # fallback on the parse (e.g. we requested int dtype, but its actually a float) + # Fallback on the parse (e.g. 
we requested int dtype, + # but its actually a float). if col_res is not None: return col_res, na_count @@ -1066,13 +1101,14 @@ cdef class TextReader: return self._string_convert(i, start, end, na_filter, na_hashset) else: col_res = None - for dt in dtype_cast_order: + for dt in self.dtype_cast_order: try: col_res, na_count = self._convert_with_dtype( dt, i, start, end, na_filter, 0, na_hashset, na_flist) except OverflowError: col_res, na_count = self._convert_with_dtype( - '|O8', i, start, end, na_filter, 0, na_hashset, na_flist) + np.dtype('object'), i, start, end, na_filter, + 0, na_hashset, na_flist) if col_res is not None: break @@ -1081,7 +1117,7 @@ cdef class TextReader: # only allow safe casts, eg. with a nan you cannot safely cast to int if col_res is not None and col_dtype is not None: try: - col_res = col_res.astype(col_dtype,casting='safe') + col_res = col_res.astype(col_dtype, casting='safe') except TypeError: # float -> int conversions can fail the above @@ -1089,12 +1125,13 @@ cdef class TextReader: col_res_orig = col_res col_res = col_res.astype(col_dtype) if (col_res != col_res_orig).any(): - raise ValueError("cannot safely convert passed user dtype of " - "{col_dtype} for {col_res} dtyped data in " - "column {column}".format(col_dtype=col_dtype, - col_res=col_res_orig.dtype.name, - column=i)) - + raise ValueError( + "cannot safely convert passed user dtype of " + "{col_dtype} for {col_res} dtyped data in " + "column {column}".format( + col_dtype=col_dtype, + col_res=col_res_orig.dtype.name, + column=i)) return col_res, na_count @@ -1104,90 +1141,90 @@ cdef class TextReader: bint user_dtype, kh_str_t *na_hashset, object na_flist): - if dtype[1] == 'i' or dtype[1] == 'u': - result, na_count = _try_int64(self.parser, i, start, end, - na_filter, na_hashset) + if is_integer_dtype(dtype): + result, na_count = _try_int64(self.parser, i, start, + end, na_filter, na_hashset) if user_dtype and na_count is not None: if na_count > 0: raise ValueError("Integer column has NA values in " - "column {column}".format(column=i)) + "column {column}".format(column=i)) - if result is not None and dtype[1:] != 'i8': + if result is not None and dtype != 'int64': result = result.astype(dtype) return result, na_count - elif dtype[1] == 'f': + elif is_float_dtype(dtype): result, na_count = _try_double(self.parser, i, start, end, na_filter, na_hashset, na_flist) - if result is not None and dtype[1:] != 'f8': + if result is not None and dtype != 'float64': result = result.astype(dtype) return result, na_count - elif dtype[1] == 'b': + elif is_bool_dtype(dtype): result, na_count = _try_bool_flex(self.parser, i, start, end, na_filter, na_hashset, self.true_set, self.false_set) return result, na_count - elif dtype[1] == 'c': - raise NotImplementedError("the dtype %s is not supported for parsing" % dtype) - - elif dtype[1] == 'S': + elif dtype.kind == 'S': # TODO: na handling - width = int(dtype[2:]) + width = dtype.itemsize if width > 0: result = _to_fw_string(self.parser, i, start, end, width) return result, 0 # treat as a regular string parsing return self._string_convert(i, start, end, na_filter, - na_hashset) - elif dtype[1] == 'U': - width = int(dtype[2:]) + na_hashset) + elif dtype.kind == 'U': + width = dtype.itemsize if width > 0: - raise NotImplementedError("the dtype %s is not supported for parsing" % dtype) + raise TypeError("the dtype %s is not " + "supported for parsing" % dtype) # unicode variable width return self._string_convert(i, start, end, na_filter, na_hashset) - - - elif dtype[1] 
== 'O': + elif is_categorical_dtype(dtype): + codes, cats, na_count = _categorical_convert( + self.parser, i, start, end, na_filter, + na_hashset, self.c_encoding) + # sort categories and recode if necessary + cats = Index(cats) + if not cats.is_monotonic_increasing: + unsorted = cats.copy() + cats = cats.sort_values() + indexer = cats.get_indexer(unsorted) + codes = take_1d(indexer, codes, fill_value=-1) + + return Categorical(codes, categories=cats, ordered=False, + fastpath=True), na_count + elif is_object_dtype(dtype): return self._string_convert(i, start, end, na_filter, na_hashset) + elif is_datetime64_dtype(dtype): + raise TypeError("the dtype %s is not supported " + "for parsing, pass this column " + "using parse_dates instead" % dtype) else: - if dtype[1] == 'M': - raise TypeError("the dtype %s is not supported for parsing, " - "pass this column using parse_dates instead" % dtype) - raise TypeError("the dtype %s is not supported for parsing" % dtype) + raise TypeError("the dtype %s is not " + "supported for parsing" % dtype) cdef _string_convert(self, Py_ssize_t i, int start, int end, bint na_filter, kh_str_t *na_hashset): - if PY3: - if self.c_encoding != NULL: - if self.c_encoding == b"utf-8": - return _string_box_utf8(self.parser, i, start, end, - na_filter, na_hashset) - else: - return _string_box_decode(self.parser, i, start, end, - na_filter, na_hashset, - self.c_encoding) - else: - return _string_box_utf8(self.parser, i, start, end, - na_filter, na_hashset) - else: - if self.c_encoding != NULL: - if self.c_encoding == b"utf-8": - return _string_box_utf8(self.parser, i, start, end, - na_filter, na_hashset) - else: - return _string_box_decode(self.parser, i, start, end, - na_filter, na_hashset, - self.c_encoding) - else: - return _string_box_factorize(self.parser, i, start, end, - na_filter, na_hashset) + + cdef StringPath path = _string_path(self.c_encoding) + + if path == UTF8: + return _string_box_utf8(self.parser, i, start, end, na_filter, + na_hashset) + elif path == ENCODED: + return _string_box_decode(self.parser, i, start, end, + na_filter, na_hashset, self.c_encoding) + elif path == CSTRING: + return _string_box_factorize(self.parser, i, start, end, + na_filter, na_hashset) def _get_converter(self, i, name): if self.converters is None: @@ -1299,6 +1336,19 @@ def _maybe_upcast(arr): return arr +cdef enum StringPath: + CSTRING + UTF8 + ENCODED + +# factored out logic to pick string converter +cdef inline StringPath _string_path(char *encoding): + if encoding != NULL and encoding != b"utf-8": + return ENCODED + elif PY3 or encoding != NULL: + return UTF8 + else: + return CSTRING # ---------------------------------------------------------------------- # Type conversions / inference support code @@ -1402,7 +1452,7 @@ cdef _string_box_utf8(parser_t *parser, int col, pyval = PyUnicode_FromString(word) k = kh_put_strbox(table, word, &ret) - table.vals[k] = pyval + table.vals[k] = pyval result[i] = pyval @@ -1460,7 +1510,7 @@ cdef _string_box_decode(parser_t *parser, int col, pyval = PyUnicode_Decode(word, size, encoding, errors) k = kh_put_strbox(table, word, &ret) - table.vals[k] = pyval + table.vals[k] = pyval result[i] = pyval @@ -1469,6 +1519,79 @@ cdef _string_box_decode(parser_t *parser, int col, return result, na_count +@cython.boundscheck(False) +cdef _categorical_convert(parser_t *parser, int col, + int line_start, int line_end, + bint na_filter, kh_str_t *na_hashset, + char *encoding): + "Convert column data into codes, categories" + cdef: + int error, na_count = 0 
+ Py_ssize_t i, size + size_t lines + coliter_t it + const char *word = NULL + + int64_t NA = -1 + int64_t[:] codes + int64_t current_category = 0 + + char *errors = "strict" + cdef StringPath path = _string_path(encoding) + + int ret = 0 + kh_str_t *table + khiter_t k + + lines = line_end - line_start + codes = np.empty(lines, dtype=np.int64) + + # factorize parsed values, creating a hash table + # bytes -> category code + with nogil: + table = kh_init_str() + coliter_setup(&it, parser, col, line_start) + + for i in range(lines): + COLITER_NEXT(it, word) + + if na_filter: + k = kh_get_str(na_hashset, word) + # is in NA values + if k != na_hashset.n_buckets: + na_count += 1 + codes[i] = NA + continue + + k = kh_get_str(table, word) + # not in the hash table + if k == table.n_buckets: + k = kh_put_str(table, word, &ret) + table.vals[k] = current_category + current_category += 1 + + codes[i] = table.vals[k] + + # parse and box categories to python strings + result = np.empty(table.n_occupied, dtype=np.object_) + if path == ENCODED: + for k in range(table.n_buckets): + if kh_exist_str(table, k): + size = strlen(table.keys[k]) + result[table.vals[k]] = PyUnicode_Decode( + table.keys[k], size, encoding, errors) + elif path == UTF8: + for k in range(table.n_buckets): + if kh_exist_str(table, k): + result[table.vals[k]] = PyUnicode_FromString(table.keys[k]) + elif path == CSTRING: + for k in range(table.n_buckets): + if kh_exist_str(table, k): + result[table.vals[k]] = PyBytes_FromString(table.keys[k]) + + kh_destroy_str(table) + return np.asarray(codes), result, na_count + cdef _to_fw_string(parser_t *parser, int col, int line_start, int line_end, size_t width): cdef: @@ -1486,8 +1609,9 @@ cdef _to_fw_string(parser_t *parser, int col, int line_start, return result -cdef inline void _to_fw_string_nogil(parser_t *parser, int col, int line_start, - int line_end, size_t width, char *data) nogil: +cdef inline void _to_fw_string_nogil(parser_t *parser, int col, + int line_start, int line_end, + size_t width, char *data) nogil: cdef: Py_ssize_t i coliter_t it @@ -1525,17 +1649,20 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end, na_fset = kset_float64_from_list(na_flist) with nogil: error = _try_double_nogil(parser, col, line_start, line_end, - na_filter, na_hashset, use_na_flist, na_fset, NA, data, &na_count) + na_filter, na_hashset, use_na_flist, + na_fset, NA, data, &na_count) kh_destroy_float64(na_fset) if error != 0: return None, None return result, na_count -cdef inline int _try_double_nogil(parser_t *parser, int col, int line_start, int line_end, - bint na_filter, kh_str_t *na_hashset, bint use_na_flist, +cdef inline int _try_double_nogil(parser_t *parser, int col, + int line_start, int line_end, + bint na_filter, kh_str_t *na_hashset, + bint use_na_flist, const kh_float64_t *na_flist, - double NA, - double *data, int *na_count) nogil: + double NA, double *data, + int *na_count) nogil: cdef: int error, size_t i @@ -1560,15 +1687,17 @@ cdef inline int _try_double_nogil(parser_t *parser, int col, int line_start, int na_count[0] += 1 data[0] = NA else: - data[0] = parser.converter(word, &p_end, parser.decimal, parser.sci, - parser.thousands, 1) + data[0] = parser.converter(word, &p_end, parser.decimal, + parser.sci, parser.thousands, 1) if errno != 0 or p_end[0] or p_end == word: - if strcasecmp(word, cinf) == 0 or strcasecmp(word, cposinf) == 0: + if (strcasecmp(word, cinf) == 0 or + strcasecmp(word, cposinf) == 0): data[0] = INF elif strcasecmp(word, cneginf) == 0: 
data[0] = NEGINF else: - # Just return a non-zero value since the errno is never consumed. + # Just return a non-zero value since + # the errno is never consumed. return 1 if use_na_flist: k64 = kh_get_float64(na_flist, data[0]) @@ -1579,15 +1708,17 @@ cdef inline int _try_double_nogil(parser_t *parser, int col, int line_start, int else: for i in range(lines): COLITER_NEXT(it, word) - data[0] = parser.converter(word, &p_end, parser.decimal, parser.sci, - parser.thousands, 1) + data[0] = parser.converter(word, &p_end, parser.decimal, + parser.sci, parser.thousands, 1) if errno != 0 or p_end[0] or p_end == word: - if strcasecmp(word, cinf) == 0 or strcasecmp(word, cposinf) == 0: + if (strcasecmp(word, cinf) == 0 or + strcasecmp(word, cposinf) == 0): data[0] = INF elif strcasecmp(word, cneginf) == 0: data[0] = NEGINF else: - # Just return a non-zero value since the errno is never consumed. + # Just return a non-zero value since + # the errno is never consumed. return 1 data += 1 @@ -1610,7 +1741,8 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end, data = result.data coliter_setup(&it, parser, col, line_start) with nogil: - error = _try_int64_nogil(parser, col, line_start, line_end, na_filter, na_hashset, NA, data, &na_count) + error = _try_int64_nogil(parser, col, line_start, line_end, + na_filter, na_hashset, NA, data, &na_count) if error != 0: if error == ERROR_OVERFLOW: # Can't get the word variable @@ -1619,9 +1751,10 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end, return result, na_count -cdef inline int _try_int64_nogil(parser_t *parser, int col, int line_start, int line_end, - bint na_filter, const kh_str_t *na_hashset, int64_t NA, int64_t *data, - int *na_count) nogil: +cdef inline int _try_int64_nogil(parser_t *parser, int col, int line_start, + int line_end, bint na_filter, + const kh_str_t *na_hashset, int64_t NA, + int64_t *data, int *na_count) nogil: cdef: int error size_t i @@ -1671,14 +1804,18 @@ cdef _try_bool(parser_t *parser, int col, int line_start, int line_end, data = result.data with nogil: - error = _try_bool_nogil(parser, col, line_start, line_end, na_filter, na_hashset, NA, data, &na_count) + error = _try_bool_nogil(parser, col, line_start, + line_end, na_filter, + na_hashset, NA, data, + &na_count) if error != 0: return None, None return result.view(np.bool_), na_count -cdef inline int _try_bool_nogil(parser_t *parser, int col, int line_start, int line_end, - bint na_filter, const kh_str_t *na_hashset, uint8_t NA, uint8_t *data, - int *na_count) nogil: +cdef inline int _try_bool_nogil(parser_t *parser, int col, int line_start, + int line_end, bint na_filter, + const kh_str_t *na_hashset, uint8_t NA, + uint8_t *data, int *na_count) nogil: cdef: int error size_t lines = line_end - line_start @@ -1687,6 +1824,7 @@ cdef inline int _try_bool_nogil(parser_t *parser, int col, int line_start, int l const char *word = NULL khiter_t k na_count[0] = 0 + coliter_setup(&it, parser, col, line_start) if na_filter: @@ -1717,7 +1855,8 @@ cdef inline int _try_bool_nogil(parser_t *parser, int col, int line_start, int l cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end, bint na_filter, const kh_str_t *na_hashset, - const kh_str_t *true_hashset, const kh_str_t *false_hashset): + const kh_str_t *true_hashset, + const kh_str_t *false_hashset): cdef: int error, na_count = 0 size_t i, lines @@ -1733,16 +1872,20 @@ cdef _try_bool_flex(parser_t *parser, int col, int line_start, int line_end, result = np.empty(lines, 
dtype=np.uint8) data = result.data with nogil: - error = _try_bool_flex_nogil(parser, col, line_start, line_end, na_filter, na_hashset, - true_hashset, false_hashset, NA, data, &na_count) + error = _try_bool_flex_nogil(parser, col, line_start, line_end, + na_filter, na_hashset, true_hashset, + false_hashset, NA, data, &na_count) if error != 0: return None, None return result.view(np.bool_), na_count -cdef inline int _try_bool_flex_nogil(parser_t *parser, int col, int line_start, int line_end, - bint na_filter, const kh_str_t *na_hashset, - const kh_str_t *true_hashset, const kh_str_t *false_hashset, - uint8_t NA, uint8_t *data, int *na_count) nogil: +cdef inline int _try_bool_flex_nogil(parser_t *parser, int col, int line_start, + int line_end, bint na_filter, + const kh_str_t *na_hashset, + const kh_str_t *true_hashset, + const kh_str_t *false_hashset, + uint8_t NA, uint8_t *data, + int *na_count) nogil: cdef: int error = 0 size_t i @@ -1804,6 +1947,7 @@ cdef inline int _try_bool_flex_nogil(parser_t *parser, int col, int line_start, return 0 + cdef kh_str_t* kset_from_list(list values) except NULL: # caller takes responsibility for freeing the hash table cdef: @@ -1847,13 +1991,21 @@ cdef kh_float64_t* kset_float64_from_list(values) except NULL: return table -# if at first you don't succeed... - -# TODO: endianness just a placeholder? -cdef list dtype_cast_order = [' value + Py_XDECREF(value) + raise old_exc message = '%s. C error: ' % base if parser.error_msg != NULL: if PY3: @@ -1866,76 +2018,6 @@ cdef raise_parser_error(object base, parser_t *parser): raise CParserError(message) -def downcast_int64(ndarray[int64_t] arr, bint use_unsigned=0): - cdef: - Py_ssize_t i, n = len(arr) - int64_t mx = INT64_MIN + 1, mn = INT64_MAX - int64_t NA = na_values[np.int64] - int64_t val - ndarray[uint8_t] mask - int na_count = 0 - - _mask = np.empty(n, dtype=bool) - mask = _mask.view(np.uint8) - - for i in range(n): - val = arr[i] - - if val == NA: - mask[i] = 1 - na_count += 1 - continue - - # not NA - mask[i] = 0 - - if val > mx: - mx = val - - if val < mn: - mn = val - - if mn >= 0 and use_unsigned: - if mx <= UINT8_MAX - 1: - result = arr.astype(np.uint8) - if na_count: - np.putmask(result, _mask, na_values[np.uint8]) - return result - - if mx <= UINT16_MAX - 1: - result = arr.astype(np.uint16) - if na_count: - np.putmask(result, _mask, na_values[np.uint16]) - return result - - if mx <= UINT32_MAX - 1: - result = arr.astype(np.uint32) - if na_count: - np.putmask(result, _mask, na_values[np.uint32]) - return result - - else: - if mn >= INT8_MIN + 1 and mx <= INT8_MAX: - result = arr.astype(np.int8) - if na_count: - np.putmask(result, _mask, na_values[np.int8]) - return result - - if mn >= INT16_MIN + 1 and mx <= INT16_MAX: - result = arr.astype(np.int16) - if na_count: - np.putmask(result, _mask, na_values[np.int16]) - return result - - if mn >= INT32_MIN + 1 and mx <= INT32_MAX: - result = arr.astype(np.int32) - if na_count: - np.putmask(result, _mask, na_values[np.int32]) - return result - - return arr - - def _concatenate_chunks(list chunks): cdef: list names = list(chunks[0].keys()) @@ -1954,18 +2036,23 @@ def _concatenate_chunks(list chunks): common_type = np.find_common_type(dtypes, []) if common_type == np.object: warning_columns.append(str(name)) - result[name] = np.concatenate(arrs) + + if is_categorical_dtype(dtypes.pop()): + result[name] = union_categoricals(arrs, sort_categories=True) + else: + result[name] = np.concatenate(arrs) if warning_columns: warning_names = 
','.join(warning_columns) - warning_message = " ".join(["Columns (%s) have mixed types." % warning_names, + warning_message = " ".join([ + "Columns (%s) have mixed types." % warning_names, "Specify dtype option on import or set low_memory=False." ]) warnings.warn(warning_message, DtypeWarning, stacklevel=8) return result -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # NA values def _compute_na_values(): int64info = np.iinfo(np.int64) @@ -1977,17 +2064,17 @@ def _compute_na_values(): uint16info = np.iinfo(np.uint16) uint8info = np.iinfo(np.uint8) na_values = { - np.float64 : np.nan, - np.int64 : int64info.min, - np.int32 : int32info.min, - np.int16 : int16info.min, - np.int8 : int8info.min, - np.uint64 : uint64info.max, - np.uint32 : uint32info.max, - np.uint16 : uint16info.max, - np.uint8 : uint8info.max, - np.bool_ : uint8info.max, - np.object_ : np.nan # oof + np.float64: np.nan, + np.int64: int64info.min, + np.int32: int32info.min, + np.int16: int16info.min, + np.int8: int8info.min, + np.uint64: uint64info.max, + np.uint32: uint32info.max, + np.uint16: uint16info.max, + np.uint8: uint8info.max, + np.bool_: uint8info.max, + np.object_: np.nan # oof } return na_values @@ -2035,7 +2122,7 @@ cdef _apply_converter(object f, parser_t *parser, int col, return lib.maybe_convert_objects(result) -def _to_structured_array(dict columns, object names): +def _to_structured_array(dict columns, object names, object usecols): cdef: ndarray recs, column cnp.dtype dt @@ -2052,6 +2139,10 @@ def _to_structured_array(dict columns, object names): # single line header names = names[0] + if usecols is not None: + names = [n for i, n in enumerate(names) + if i in usecols or n in usecols] + dt = np.dtype([(str(name), columns[i].dtype) for i, name in enumerate(names)]) fnames = dt.names @@ -2066,22 +2157,13 @@ def _to_structured_array(dict columns, object names): stride = dt.itemsize - # start = time.time() - - # we own the data + # We own the data. buf = malloc(length * stride) recs = util.sarr_from_data(dt, length, buf) assert(recs.flags.owndata) - # buf = recs.data - # end = time.time() - # print 'took %.4f' % (end - start) - for i in range(nfields): - # start = time.clock() - # name = names[i] - # XXX field_type = fields[fnames[i]] @@ -2094,9 +2176,6 @@ def _to_structured_array(dict columns, object names): elsize, stride, length, field_type[0] == np.object_) - # print 'Transfer of %s took %.4f' % (str(field_type), - # time.clock() - start) - return recs cdef _fill_structured_column(char *dst, char* src, int elsize, @@ -2113,7 +2192,6 @@ cdef _fill_structured_column(char *dst, char* src, int elsize, src += elsize - def _maybe_encode(values): if values is None: return [] diff --git a/pandas/sandbox/qtpandas.py b/pandas/sandbox/qtpandas.py deleted file mode 100644 index b6af40a0e2156..0000000000000 --- a/pandas/sandbox/qtpandas.py +++ /dev/null @@ -1,145 +0,0 @@ -""" -Easy integration of DataFrame into pyqt framework - -@author: Jev Kuznetsov -""" - -# flake8: noqa - -# GH9615 - -import warnings -warnings.warn("The pandas.sandbox.qtpandas module is deprecated and will be " - "removed in a future version. 
We refer users to the external package " - "here: https://github.com/datalyze-solutions/pandas-qt") - -try: - from PyQt4.QtCore import QAbstractTableModel, Qt, QVariant, QModelIndex - from PyQt4.QtGui import ( - QApplication, QDialog, QVBoxLayout, QTableView, QWidget) -except ImportError: - from PySide.QtCore import QAbstractTableModel, Qt, QModelIndex - from PySide.QtGui import ( - QApplication, QDialog, QVBoxLayout, QTableView, QWidget) - QVariant = lambda value=None: value - -from pandas import DataFrame, Index - - -class DataFrameModel(QAbstractTableModel): - """ data model for a DataFrame class """ - def __init__(self): - super(DataFrameModel, self).__init__() - self.df = DataFrame() - - def setDataFrame(self, dataFrame): - self.df = dataFrame - - def signalUpdate(self): - """ tell viewers to update their data (this is full update, not - efficient)""" - self.layoutChanged.emit() - - #------------- table display functions ----------------- - def headerData(self, section, orientation, role=Qt.DisplayRole): - if role != Qt.DisplayRole: - return QVariant() - - if orientation == Qt.Horizontal: - try: - return self.df.columns.tolist()[section] - except (IndexError, ): - return QVariant() - elif orientation == Qt.Vertical: - try: - # return self.df.index.tolist() - return self.df.index.tolist()[section] - except (IndexError, ): - return QVariant() - - def data(self, index, role=Qt.DisplayRole): - if role != Qt.DisplayRole: - return QVariant() - - if not index.isValid(): - return QVariant() - - return QVariant(str(self.df.ix[index.row(), index.column()])) - - def flags(self, index): - flags = super(DataFrameModel, self).flags(index) - flags |= Qt.ItemIsEditable - return flags - - def setData(self, index, value, role): - row = self.df.index[index.row()] - col = self.df.columns[index.column()] - if hasattr(value, 'toPyObject'): - # PyQt4 gets a QVariant - value = value.toPyObject() - else: - # PySide gets an unicode - dtype = self.df[col].dtype - if dtype != object: - value = None if value == '' else dtype.type(value) - self.df.set_value(row, col, value) - return True - - def rowCount(self, index=QModelIndex()): - return self.df.shape[0] - - def columnCount(self, index=QModelIndex()): - return self.df.shape[1] - - -class DataFrameWidget(QWidget): - """ a simple widget for using DataFrames in a gui """ - def __init__(self, dataFrame, parent=None): - super(DataFrameWidget, self).__init__(parent) - - self.dataModel = DataFrameModel() - self.dataTable = QTableView() - self.dataTable.setModel(self.dataModel) - - layout = QVBoxLayout() - layout.addWidget(self.dataTable) - self.setLayout(layout) - # Set DataFrame - self.setDataFrame(dataFrame) - - def setDataFrame(self, dataFrame): - self.dataModel.setDataFrame(dataFrame) - self.dataModel.signalUpdate() - self.dataTable.resizeColumnsToContents() - -#-----------------stand alone test code - - -def testDf(): - """ creates test dataframe """ - data = {'int': [1, 2, 3], 'float': [1.5, 2.5, 3.5], - 'string': ['a', 'b', 'c'], 'nan': [np.nan, np.nan, np.nan]} - return DataFrame(data, index=Index(['AAA', 'BBB', 'CCC']), - columns=['int', 'float', 'string', 'nan']) - - -class Form(QDialog): - def __init__(self, parent=None): - super(Form, self).__init__(parent) - - df = testDf() # make up some data - widget = DataFrameWidget(df) - widget.resizeColumnsToContents() - - layout = QVBoxLayout() - layout.addWidget(widget) - self.setLayout(layout) - -if __name__ == '__main__': - import sys - import numpy as np - - app = QApplication(sys.argv) - form = 
Form() - form.show() - app.exec_() diff --git a/pandas/sparse/api.py b/pandas/sparse/api.py index b4d874e6a1ab9..55841fbeffa2d 100644 --- a/pandas/sparse/api.py +++ b/pandas/sparse/api.py @@ -4,4 +4,3 @@ from pandas.sparse.list import SparseList from pandas.sparse.series import SparseSeries, SparseTimeSeries from pandas.sparse.frame import SparseDataFrame -from pandas.sparse.panel import SparsePanel diff --git a/pandas/sparse/array.py b/pandas/sparse/array.py index 0312fb023f7fd..8420371d05e02 100644 --- a/pandas/sparse/array.py +++ b/pandas/sparse/array.py @@ -4,17 +4,26 @@ from __future__ import division # pylint: disable=E1101,E1103,W0231 -from numpy import nan, ndarray import numpy as np import pandas as pd from pandas.core.base import PandasObject -import pandas.core.common as com -from pandas import compat, lib +from pandas import compat from pandas.compat import range from pandas.compat.numpy import function as nv +from pandas.types.generic import ABCSparseArray, ABCSparseSeries +from pandas.types.common import (_ensure_platform_int, + is_float, is_integer, + is_integer_dtype, + is_bool_dtype, + is_list_like, + is_scalar, is_dtype_equal) +from pandas.types.cast import (_possibly_convert_platform, _maybe_promote, + _astype_nansafe, _find_common_type) +from pandas.types.missing import isnull, notnull, na_value_for_dtype + from pandas._sparse import SparseIndex, BlockIndex, IntIndex import pandas._sparse as splib import pandas.index as _index @@ -40,17 +49,17 @@ def wrapper(self, other): if len(self) != len(other): raise AssertionError("length mismatch: %d vs. %d" % (len(self), len(other))) - if not isinstance(other, com.ABCSparseArray): - other = SparseArray(other, fill_value=self.fill_value) - if name[0] == 'r': - return _sparse_array_op(other, self, op, name[1:]) - else: - return _sparse_array_op(self, other, op, name) - elif lib.isscalar(other): - new_fill_value = op(np.float64(self.fill_value), np.float64(other)) - - return _wrap_result(name, op(self.sp_values, other), - self.sp_index, new_fill_value) + if not isinstance(other, ABCSparseArray): + dtype = getattr(other, 'dtype', None) + other = SparseArray(other, fill_value=self.fill_value, + dtype=dtype) + return _sparse_array_op(self, other, op, name) + elif is_scalar(other): + with np.errstate(all='ignore'): + fill = op(_get_fill(self), np.asarray(other)) + result = op(self.sp_values, other) + + return _wrap_result(name, result, self.sp_index, fill) else: # pragma: no cover raise TypeError('operation with %s not supported' % type(other)) @@ -60,33 +69,89 @@ def wrapper(self, other): return wrapper -def _sparse_array_op(left, right, op, name): - if left.sp_index.equals(right.sp_index): - result = op(left.sp_values, right.sp_values) - result_index = left.sp_index - else: - sparse_op = getattr(splib, 'sparse_%s' % name) - result, result_index = sparse_op(left.sp_values, left.sp_index, - left.fill_value, right.sp_values, - right.sp_index, right.fill_value) +def _get_fill(arr): + # coerce fill_value to arr dtype if possible + # int64 SparseArray can have NaN as fill_value if there is no missing try: - fill_value = op(left.fill_value, right.fill_value) - except: - fill_value = nan - return _wrap_result(name, result, result_index, fill_value) + return np.asarray(arr.fill_value, dtype=arr.dtype) + except ValueError: + return np.asarray(arr.fill_value) + + +def _sparse_array_op(left, right, op, name, series=False): + + if series and is_integer_dtype(left) and is_integer_dtype(right): + # series coerces to float64 if result should have 
NaN/inf + if name in ('floordiv', 'mod') and (right.values == 0).any(): + left = left.astype(np.float64) + right = right.astype(np.float64) + elif name in ('rfloordiv', 'rmod') and (left.values == 0).any(): + left = left.astype(np.float64) + right = right.astype(np.float64) + + # dtype used to find corresponding sparse method + if not is_dtype_equal(left.dtype, right.dtype): + dtype = _find_common_type([left.dtype, right.dtype]) + left = left.astype(dtype) + right = right.astype(dtype) + else: + dtype = left.dtype + + # dtype the result must have + result_dtype = None + + if left.sp_index.ngaps == 0 or right.sp_index.ngaps == 0: + with np.errstate(all='ignore'): + result = op(left.get_values(), right.get_values()) + fill = op(_get_fill(left), _get_fill(right)) + + if left.sp_index.ngaps == 0: + index = left.sp_index + else: + index = right.sp_index + elif left.sp_index.equals(right.sp_index): + with np.errstate(all='ignore'): + result = op(left.sp_values, right.sp_values) + fill = op(_get_fill(left), _get_fill(right)) + index = left.sp_index + else: + if name[0] == 'r': + left, right = right, left + name = name[1:] + + if name in ('and', 'or') and dtype == 'bool': + opname = 'sparse_{name}_uint8'.format(name=name, dtype=dtype) + # to make template simple, cast here + left_sp_values = left.sp_values.view(np.uint8) + right_sp_values = right.sp_values.view(np.uint8) + result_dtype = np.bool + else: + opname = 'sparse_{name}_{dtype}'.format(name=name, dtype=dtype) + left_sp_values = left.sp_values + right_sp_values = right.sp_values + + sparse_op = getattr(splib, opname) + with np.errstate(all='ignore'): + result, index, fill = sparse_op(left_sp_values, left.sp_index, + left.fill_value, right_sp_values, + right.sp_index, right.fill_value) + + if result_dtype is None: + result_dtype = result.dtype + + return _wrap_result(name, result, index, fill, dtype=result_dtype) -def _wrap_result(name, data, sparse_index, fill_value): +def _wrap_result(name, data, sparse_index, fill_value, dtype=None): """ wrap op result to have correct dtype """ if name in ('eq', 'ne', 'lt', 'gt', 'le', 'ge'): - # ToDo: We can remove this condition when removing - # SparseArray's dtype default when closing GH 667 - return SparseArray(data, sparse_index=sparse_index, - fill_value=fill_value, - dtype=np.bool) - else: - return SparseArray(data, sparse_index=sparse_index, - fill_value=fill_value) + dtype = np.bool + + if is_bool_dtype(dtype): + # fill_value may be np.bool_ + fill_value = bool(fill_value) + return SparseArray(data, sparse_index=sparse_index, + fill_value=fill_value, dtype=dtype) class SparseArray(PandasObject, np.ndarray): @@ -97,7 +162,8 @@ class SparseArray(PandasObject, np.ndarray): data : {array-like (1-D), Series, SparseSeries, dict} kind : {'block', 'integer'} fill_value : float - Defaults to NaN (code for missing) + Code for missing value. Defaults depends on dtype. + 0 for int dtype, False for bool dtype, and NaN for other dtypes sparse_index : {BlockIndex, IntIndex}, optional Only if you have one. 
Mainly used internally @@ -115,69 +181,63 @@ class SparseArray(PandasObject, np.ndarray): fill_value = None def __new__(cls, data, sparse_index=None, index=None, kind='integer', - fill_value=None, dtype=np.float64, copy=False): + fill_value=None, dtype=None, copy=False): if index is not None: if data is None: data = np.nan - if not lib.isscalar(data): + if not is_scalar(data): raise Exception("must only pass scalars with an index ") values = np.empty(len(index), dtype='float64') values.fill(data) data = values + if isinstance(data, ABCSparseSeries): + data = data.values + is_sparse_array = isinstance(data, SparseArray) + if dtype is not None: dtype = np.dtype(dtype) - is_sparse_array = isinstance(data, SparseArray) - if fill_value is None: - if is_sparse_array: - fill_value = data.fill_value - else: - fill_value = nan if is_sparse_array: sparse_index = data.sp_index - values = np.asarray(data) + values = data.sp_values + fill_value = data.fill_value else: # array-like if sparse_index is None: - values, sparse_index = make_sparse(data, kind=kind, - fill_value=fill_value) + if dtype is not None: + data = np.asarray(data, dtype=dtype) + res = make_sparse(data, kind=kind, fill_value=fill_value) + values, sparse_index, fill_value = res else: values = _sanitize_values(data) if len(values) != sparse_index.npoints: raise AssertionError("Non array-like type {0} must have" " the same length as the" " index".format(type(values))) - # Create array, do *not* copy data by default if copy: - try: - # ToDo: Can remove this error handling when we actually - # support other dtypes - subarr = np.array(values, dtype=dtype, copy=True) - except ValueError: - subarr = np.array(values, copy=True) + subarr = np.array(values, dtype=dtype, copy=True) else: - try: - subarr = np.asarray(values, dtype=dtype) - except ValueError: - subarr = np.asarray(values) - - # if we have a bool type, make sure that we have a bool fill_value - if ((dtype is not None and issubclass(dtype.type, np.bool_)) or - (data is not None and lib.is_bool_array(subarr))): - if np.isnan(fill_value) or not fill_value: - fill_value = False - else: - fill_value = bool(fill_value) - + subarr = np.asarray(values, dtype=dtype) # Change the class of the array to be the subclass type. 
return cls._simple_new(subarr, sparse_index, fill_value) @classmethod def _simple_new(cls, data, sp_index, fill_value): - if (com.is_integer_dtype(data) and com.is_float(fill_value) and + if not isinstance(sp_index, SparseIndex): + # caller must pass SparseIndex + raise ValueError('sp_index must be a SparseIndex') + + if fill_value is None: + if sp_index.ngaps > 0: + # has missing hole + fill_value = np.nan + else: + fill_value = na_value_for_dtype(data.dtype) + + if (is_integer_dtype(data) and is_float(fill_value) and sp_index.ngaps > 0): # if float fill_value is being included in dense repr, # convert values to float @@ -190,7 +250,7 @@ def _simple_new(cls, data, sp_index, fill_value): raise ValueError('sp_index must be a SparseIndex') result.sp_index = sp_index - result.fill_value = fill_value + result._fill_value = fill_value return result @property @@ -205,17 +265,46 @@ def kind(self): elif isinstance(self.sp_index, IntIndex): return 'integer' + def __array_wrap__(self, out_arr, context=None): + """ + NumPy calls this method when ufunc is applied + + Parameters + ---------- + + out_arr : ndarray + ufunc result (note that ufunc is only applied to sp_values) + context : tuple of 3 elements (ufunc, signature, domain) + for example, following is a context when np.sin is applied to + SparseArray, + + (, (SparseArray,), 0)) + + See http://docs.scipy.org/doc/numpy/user/basics.subclassing.html + """ + if isinstance(context, tuple) and len(context) == 3: + ufunc, args, domain = context + # to apply ufunc only to fill_value (to avoid recursive call) + args = [getattr(a, 'fill_value', a) for a in args] + with np.errstate(all='ignore'): + fill_value = ufunc(self.fill_value, *args[1:]) + else: + fill_value = self.fill_value + + return self._simple_new(out_arr, sp_index=self.sp_index, + fill_value=fill_value) + def __array_finalize__(self, obj): """ Gets called after any ufunc or other array operations, necessary to pass on the index. 
""" self.sp_index = getattr(obj, 'sp_index', None) - self.fill_value = getattr(obj, 'fill_value', None) + self._fill_value = getattr(obj, 'fill_value', None) def __reduce__(self): """Necessary for making this object picklable""" - object_state = list(ndarray.__reduce__(self)) + object_state = list(np.ndarray.__reduce__(self)) subclass_state = self.fill_value, self.sp_index object_state[2] = (object_state[2], subclass_state) return tuple(object_state) @@ -223,11 +312,11 @@ def __reduce__(self): def __setstate__(self, state): """Necessary for making this object picklable""" nd_state, own_state = state - ndarray.__setstate__(self, nd_state) + np.ndarray.__setstate__(self, nd_state) fill_value, sp_index = own_state[:2] self.sp_index = sp_index - self.fill_value = fill_value + self._fill_value = fill_value def __len__(self): try: @@ -270,6 +359,22 @@ def sp_values(self): # caching not an option, leaks memory return self.view(np.ndarray) + @property + def fill_value(self): + return self._fill_value + + @fill_value.setter + def fill_value(self, value): + if not is_scalar(value): + raise ValueError('fill_value must be a scalar') + # if the specified value triggers type promotion, raise ValueError + new_dtype, fill_value = _maybe_promote(self.dtype, value) + if is_dtype_equal(self.dtype, new_dtype): + self._fill_value = fill_value + else: + msg = 'unable to set fill_value {0} to {1} dtype' + raise ValueError(msg.format(value, self.dtype)) + def get_values(self, fill=None): """ return a dense representation """ return self.to_dense(fill=fill) @@ -285,16 +390,21 @@ def __iter__(self): yield self._get_val_at(i) def __getitem__(self, key): + """ """ - if com.is_integer(key): + + if is_integer(key): return self._get_val_at(key) elif isinstance(key, tuple): data_slice = self.values[key] else: if isinstance(key, SparseArray): - key = np.asarray(key) + if is_bool_dtype(key): + key = key.to_dense() + else: + key = np.asarray(key) if hasattr(key, '__len__') and len(self) != len(key): return self.take(key) @@ -340,11 +450,11 @@ def take(self, indices, axis=0, allow_fill=True, if axis: raise ValueError("axis must be 0, input was {0}".format(axis)) - if com.is_integer(indices): + if is_integer(indices): # return scalar return self[indices] - indices = com._ensure_platform_int(indices) + indices = _ensure_platform_int(indices) n = len(self) if allow_fill and fill_value is not None: # allow -1 to indicate self.fill_value, @@ -380,7 +490,7 @@ def take(self, indices, axis=0, allow_fill=True, return self._simple_new(new_values, sp_index, self.fill_value) def __setitem__(self, key, value): - # if com.is_integer(key): + # if is_integer(key): # self.values[key] = value # else: # raise Exception("SparseArray does not support seting non-scalars @@ -395,7 +505,7 @@ def __setslice__(self, i, j, value): j = 0 slobj = slice(i, j) # noqa - # if not lib.isscalar(value): + # if not is_scalar(value): # raise Exception("SparseArray does not support seting non-scalars # via slices") @@ -405,14 +515,20 @@ def __setslice__(self, i, j, value): raise TypeError("SparseArray does not support item assignment via " "slices") - def astype(self, dtype=None): - """ - - """ + def astype(self, dtype=None, copy=True): dtype = np.dtype(dtype) - if dtype is not None and dtype not in (np.float_, float): - raise TypeError('Can only support floating point data for now') - return self.copy() + sp_values = _astype_nansafe(self.sp_values, dtype, copy=copy) + try: + if is_bool_dtype(dtype): + # to avoid np.bool_ dtype + fill_value = 
bool(self.fill_value) + else: + fill_value = dtype.type(self.fill_value) + except ValueError: + msg = 'unable to coerce current fill_value {0} to {1} dtype' + raise ValueError(msg.format(self.fill_value, dtype)) + return self._simple_new(sp_values, self.sp_index, + fill_value=fill_value) def copy(self, deep=True): """ @@ -445,12 +561,12 @@ def count(self): @property def _null_fill_value(self): - return com.isnull(self.fill_value) + return isnull(self.fill_value) @property def _valid_sp_values(self): sp_vals = self.sp_values - mask = com.notnull(sp_vals) + mask = notnull(sp_vals) return sp_vals[mask] @Appender(_index_shared_docs['fillna'] % _sparray_doc_kwargs) @@ -466,7 +582,7 @@ def fillna(self, value, downcast=None): fill_value=value) else: new_values = self.sp_values.copy() - new_values[com.isnull(new_values)] = value + new_values[isnull(new_values)] = value return self._simple_new(new_values, self.sp_index, fill_value=self.fill_value) @@ -498,7 +614,7 @@ def cumsum(self, axis=0, *args, **kwargs): nv.validate_cumsum(args, kwargs) # TODO: gh-12855 - return a SparseArray here - if com.notnull(self.fill_value): + if notnull(self.fill_value): return self.to_dense().cumsum() # TODO: what if sp_values contains NaN?? @@ -569,11 +685,9 @@ def _maybe_to_dense(obj): def _maybe_to_sparse(array): - if isinstance(array, com.ABCSparseSeries): - array = SparseArray(array.values, sparse_index=array.sp_index, - fill_value=array.fill_value, copy=True) - if not isinstance(array, SparseArray): - array = com._values_from_object(array) + """ array must be SparseSeries or SparseArray """ + if isinstance(array, ABCSparseSeries): + array = array.values.copy() return array @@ -588,15 +702,15 @@ def _sanitize_values(arr): else: # scalar - if lib.isscalar(arr): + if is_scalar(arr): arr = [arr] # ndarray if isinstance(arr, np.ndarray): pass - elif com.is_list_like(arr) and len(arr) > 0: - arr = com._possibly_convert_platform(arr) + elif is_list_like(arr) and len(arr) > 0: + arr = _possibly_convert_platform(arr) else: arr = np.asarray(arr) @@ -604,7 +718,7 @@ def _sanitize_values(arr): return arr -def make_sparse(arr, kind='block', fill_value=nan): +def make_sparse(arr, kind='block', fill_value=None): """ Convert ndarray to sparse format @@ -624,8 +738,11 @@ def make_sparse(arr, kind='block', fill_value=nan): if arr.ndim > 1: raise TypeError("expected dimension <= 1 data") - if com.isnull(fill_value): - mask = com.notnull(arr) + if fill_value is None: + fill_value = na_value_for_dtype(arr.dtype) + + if isnull(fill_value): + mask = notnull(arr) else: mask = arr != fill_value @@ -638,7 +755,7 @@ def make_sparse(arr, kind='block', fill_value=nan): index = _make_index(length, indices, kind) sparsified_values = arr[mask] - return sparsified_values, index + return sparsified_values, index, fill_value def _make_index(length, indices, kind): @@ -655,4 +772,5 @@ def _make_index(length, indices, kind): ops.add_special_arithmetic_methods(SparseArray, arith_method=_arith_method, comp_method=_arith_method, + bool_method=_arith_method, use_numexpr=False) diff --git a/pandas/sparse/frame.py b/pandas/sparse/frame.py index 52a6e6edf0896..8eeff045d1fac 100644 --- a/pandas/sparse/frame.py +++ b/pandas/sparse/frame.py @@ -10,22 +10,28 @@ from pandas import compat import numpy as np +from pandas.types.missing import isnull, notnull +from pandas.types.cast import _maybe_upcast +from pandas.types.common import _ensure_platform_int + +from pandas.core.common import _try_sort from pandas.compat.numpy import function as nv -from 
pandas.core.common import isnull, _try_sort from pandas.core.index import Index, MultiIndex, _ensure_index from pandas.core.series import Series from pandas.core.frame import (DataFrame, extract_index, _prep_ndarray, _default_index) -import pandas.core.common as com import pandas.core.algorithms as algos from pandas.core.internals import (BlockManager, create_block_manager_from_arrays) -from pandas.core.generic import NDFrame +import pandas.core.generic as generic from pandas.sparse.series import SparseSeries, SparseArray from pandas.util.decorators import Appender import pandas.core.ops as ops +_shared_doc_kwargs = dict(klass='SparseDataFrame') + + class SparseDataFrame(DataFrame): """ DataFrame containing sparse floating point data in the form of SparseSeries @@ -116,7 +122,7 @@ def __init__(self, data=None, index=None, columns=None, default_kind=None, if dtype is not None: mgr = mgr.astype(dtype) - NDFrame.__init__(self, mgr) + generic.NDFrame.__init__(self, mgr) @property def _constructor(self): @@ -149,7 +155,7 @@ def _init_dict(self, data, index, columns, dtype=None): if not isinstance(v, SparseSeries): v = sp_maker(v.values) elif isinstance(v, SparseArray): - v = sp_maker(v.values) + v = v.copy() else: if isinstance(v, dict): v = [v.get(i, nan) for i in index] @@ -186,7 +192,7 @@ def _init_matrix(self, data, index, columns, dtype=None): return self._init_dict(data, index, columns, dtype) def __array_wrap__(self, result): - return SparseDataFrame( + return self._constructor( result, index=self.index, columns=self.columns, default_kind=self._default_kind, default_fill_value=self._default_fill_value).__finalize__(self) @@ -233,8 +239,19 @@ def to_dense(self): data = dict((k, v.to_dense()) for k, v in compat.iteritems(self)) return DataFrame(data, index=self.index, columns=self.columns) + def _apply_columns(self, func): + """ get new SparseDataFrame applying func to each columns """ + + new_data = {} + for col, series in compat.iteritems(self): + new_data[col] = func(series) + + return self._constructor( + data=new_data, index=self.index, columns=self.columns, + default_fill_value=self.default_fill_value).__finalize__(self) + def astype(self, dtype): - raise NotImplementedError + return self._apply_columns(lambda x: x.astype(dtype)) def copy(self, deep=True): """ @@ -405,7 +422,7 @@ def _combine_frame(self, other, func, fill_value=None, level=None): raise NotImplementedError("'level' argument is not supported") if self.empty and other.empty: - return SparseDataFrame(index=new_index).__finalize__(self) + return self._constructor(index=new_index).__finalize__(self) new_data = {} new_fill_value = None @@ -496,14 +513,8 @@ def _combine_match_columns(self, other, func, level=None, fill_value=None): new_data, index=self.index, columns=union, default_fill_value=self.default_fill_value).__finalize__(self) - def _combine_const(self, other, func): - new_data = {} - for col, series in compat.iteritems(self): - new_data[col] = func(series, other) - - return self._constructor( - data=new_data, index=self.index, columns=self.columns, - default_fill_value=self.default_fill_value).__finalize__(self) + def _combine_const(self, other, func, raise_on_error=True): + return self._apply_columns(lambda x: func(x, other)) def _reindex_index(self, index, method, copy, level, fill_value=np.nan, limit=None, takeable=False): @@ -517,10 +528,11 @@ def _reindex_index(self, index, method, copy, level, fill_value=np.nan, return self if len(self.index) == 0: - return SparseDataFrame(index=index, columns=self.columns) + 
return self._constructor( + index=index, columns=self.columns).__finalize__(self) indexer = self.index.get_indexer(index, method, limit=limit) - indexer = com._ensure_platform_int(indexer) + indexer = _ensure_platform_int(indexer) mask = indexer == -1 need_mask = mask.any() @@ -534,19 +546,23 @@ def _reindex_index(self, index, method, copy, level, fill_value=np.nan, new = values.take(indexer) if need_mask: new = new.values + # convert integer to float if necessary. need to do a lot + # more than that, handle boolean etc also + new, fill_value = _maybe_upcast(new, fill_value=fill_value) np.putmask(new, mask, fill_value) new_series[col] = new - return SparseDataFrame(new_series, index=index, columns=self.columns, - default_fill_value=self._default_fill_value) + return self._constructor( + new_series, index=index, columns=self.columns, + default_fill_value=self._default_fill_value).__finalize__(self) def _reindex_columns(self, columns, copy, level, fill_value, limit=None, takeable=False): if level is not None: raise TypeError('Reindex by level not supported for sparse') - if com.notnull(fill_value): + if notnull(fill_value): raise NotImplementedError("'fill_value' argument is not supported") if limit: @@ -554,8 +570,9 @@ def _reindex_columns(self, columns, copy, level, fill_value, limit=None, # TODO: fill value handling sdict = dict((k, v) for k, v in compat.iteritems(self) if k in columns) - return SparseDataFrame(sdict, index=self.index, columns=columns, - default_fill_value=self._default_fill_value) + return self._constructor( + sdict, index=self.index, columns=columns, + default_fill_value=self._default_fill_value).__finalize__(self) def _reindex_with_indexers(self, reindexers, method=None, fill_value=None, limit=None, copy=False, allow_dups=False): @@ -584,8 +601,8 @@ def _reindex_with_indexers(self, reindexers, method=None, fill_value=None, else: new_arrays[col] = self[col] - return SparseDataFrame(new_arrays, index=index, - columns=columns).__finalize__(self) + return self._constructor(new_arrays, index=index, + columns=columns).__finalize__(self) def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='', sort=False): @@ -642,7 +659,7 @@ def transpose(self, *args, **kwargs): Returns a DataFrame with the rows/columns switched. 
""" nv.validate_transpose(args, kwargs) - return SparseDataFrame( + return self._constructor( self.values.T, index=self.columns, columns=self.index, default_fill_value=self._default_fill_value, default_kind=self._default_kind).__finalize__(self) @@ -676,6 +693,14 @@ def cumsum(self, axis=0, *args, **kwargs): return self.apply(lambda x: x.cumsum(), axis=axis) + @Appender(generic._shared_docs['isnull']) + def isnull(self): + return self._apply_columns(lambda x: x.isnull()) + + @Appender(generic._shared_docs['isnotnull']) + def isnotnull(self): + return self._apply_columns(lambda x: x.isnotnull()) + def apply(self, func, axis=0, broadcast=False, reduce=False): """ Analogous to DataFrame.apply, for SparseDataFrame @@ -701,7 +726,7 @@ def apply(self, func, axis=0, broadcast=False, reduce=False): new_series = {} for k, v in compat.iteritems(self): applied = func(v) - applied.fill_value = func(applied.fill_value) + applied.fill_value = func(v.fill_value) new_series[k] = applied return self._constructor( new_series, index=self.index, columns=self.columns, diff --git a/pandas/sparse/list.py b/pandas/sparse/list.py index bc10b73a47723..82de8cd7d3959 100644 --- a/pandas/sparse/list.py +++ b/pandas/sparse/list.py @@ -1,10 +1,11 @@ +import warnings import numpy as np from pandas.core.base import PandasObject from pandas.formats.printing import pprint_thing +from pandas.types.common import is_scalar from pandas.sparse.array import SparseArray import pandas._sparse as splib -import pandas.lib as lib class SparseList(PandasObject): @@ -20,6 +21,11 @@ class SparseList(PandasObject): """ def __init__(self, data=None, fill_value=np.nan): + + # see gh-13784 + warnings.warn("SparseList is deprecated and will be removed " + "in a future version", FutureWarning, stacklevel=2) + self.fill_value = fill_value self._chunks = [] @@ -121,7 +127,7 @@ def append(self, value): ---------- value: scalar or array-like """ - if lib.isscalar(value): + if is_scalar(value): value = [value] sparr = SparseArray(value, fill_value=self.fill_value) diff --git a/pandas/sparse/panel.py b/pandas/sparse/panel.py deleted file mode 100644 index 88f396d20a91e..0000000000000 --- a/pandas/sparse/panel.py +++ /dev/null @@ -1,563 +0,0 @@ -""" -Data structures for sparse float data. 
Life is made simpler by dealing only -with float64 data -""" - -# pylint: disable=E1101,E1103,W0231 - -import warnings -from pandas.compat import lrange, zip -from pandas import compat -import numpy as np - -from pandas.core.index import Index, MultiIndex, _ensure_index -from pandas.core.frame import DataFrame -from pandas.core.panel import Panel -from pandas.sparse.frame import SparseDataFrame -from pandas.util.decorators import deprecate - -import pandas.core.common as com -import pandas.core.ops as ops -import pandas.lib as lib - - -class SparsePanelAxis(object): - def __init__(self, cache_field, frame_attr): - self.cache_field = cache_field - self.frame_attr = frame_attr - - def __get__(self, obj, type=None): - return getattr(obj, self.cache_field, None) - - def __set__(self, obj, value): - value = _ensure_index(value) - - if isinstance(value, MultiIndex): - raise NotImplementedError("value cannot be a MultiIndex") - - for v in compat.itervalues(obj._frames): - setattr(v, self.frame_attr, value) - - setattr(obj, self.cache_field, value) - - -class SparsePanel(Panel): - """ - Sparse version of Panel - - Parameters - ---------- - frames : dict of DataFrame objects - items : array-like - major_axis : array-like - minor_axis : array-like - default_kind : {'block', 'integer'}, default 'block' - Default sparse kind for converting Series to SparseSeries. Will not - override SparseSeries passed into constructor - default_fill_value : float - Default fill_value for converting Series to SparseSeries. Will not - override SparseSeries passed in - - Notes - ----- - """ - ndim = 3 - _typ = 'panel' - _subtyp = 'sparse_panel' - - def __init__(self, frames=None, items=None, major_axis=None, - minor_axis=None, default_fill_value=np.nan, - default_kind='block', copy=False): - - # deprecation #11157 - warnings.warn("SparsePanel is deprecated and will be removed in a " - "future version", FutureWarning, stacklevel=2) - - if frames is None: - frames = {} - - if isinstance(frames, np.ndarray): - new_frames = {} - for item, vals in zip(items, frames): - new_frames[item] = SparseDataFrame( - vals, index=major_axis, columns=minor_axis, - default_fill_value=default_fill_value, - default_kind=default_kind) - frames = new_frames - - if not isinstance(frames, dict): - raise TypeError('input must be a dict, a %r was passed' % - type(frames).__name__) - - self.default_fill_value = fill_value = default_fill_value - self.default_kind = kind = default_kind - - # pre-filter, if necessary - if items is None: - items = Index(sorted(frames.keys())) - items = _ensure_index(items) - - (clean_frames, major_axis, - minor_axis) = _convert_frames(frames, major_axis, minor_axis, - kind=kind, fill_value=fill_value) - - self._frames = clean_frames - - # do we want to fill missing ones? 
- for item in items: - if item not in clean_frames: - raise ValueError('column %r not found in data' % item) - - self._items = items - self.major_axis = major_axis - self.minor_axis = minor_axis - - def _consolidate_inplace(self): # pragma: no cover - # do nothing when DataFrame calls this method - pass - - def __array_wrap__(self, result): - return SparsePanel(result, items=self.items, - major_axis=self.major_axis, - minor_axis=self.minor_axis, - default_kind=self.default_kind, - default_fill_value=self.default_fill_value) - - @classmethod - def from_dict(cls, data): - """ - Analogous to Panel.from_dict - """ - return SparsePanel(data) - - def to_dense(self): - """ - Convert SparsePanel to (dense) Panel - - Returns - ------- - dense : Panel - """ - return Panel(self.values, self.items, self.major_axis, self.minor_axis) - - def as_matrix(self): - return self.values - - @property - def values(self): - # return dense values - return np.array([self._frames[item].values for item in self.items]) - - # need a special property for items to make the field assignable - - _items = None - - def _get_items(self): - return self._items - - def _set_items(self, new_items): - new_items = _ensure_index(new_items) - if isinstance(new_items, MultiIndex): - raise NotImplementedError("itemps cannot be a MultiIndex") - - # need to create new frames dict - - old_frame_dict = self._frames - old_items = self._items - self._frames = dict((new_k, old_frame_dict[old_k]) - for new_k, old_k in zip(new_items, old_items)) - self._items = new_items - - items = property(fget=_get_items, fset=_set_items) - - # DataFrame's index - major_axis = SparsePanelAxis('_major_axis', 'index') - - # DataFrame's columns / "items" - minor_axis = SparsePanelAxis('_minor_axis', 'columns') - - def _ixs(self, i, axis=0): - """ - for compat as we don't support Block Manager here - i : int, slice, or sequence of integers - axis : int - """ - - key = self._get_axis(axis)[i] - - # xs cannot handle a non-scalar key, so just reindex here - if com.is_list_like(key): - return self.reindex(**{self._get_axis_name(axis): key}) - - return self.xs(key, axis=axis) - - def _slice(self, slobj, axis=0, kind=None): - """ - for compat as we don't support Block Manager here - """ - axis = self._get_axis_name(axis) - index = self._get_axis(axis) - - return self.reindex(**{axis: index[slobj]}) - - def _get_item_cache(self, key): - return self._frames[key] - - def __setitem__(self, key, value): - if isinstance(value, DataFrame): - value = value.reindex(index=self.major_axis, - columns=self.minor_axis) - if not isinstance(value, SparseDataFrame): - value = value.to_sparse(fill_value=self.default_fill_value, - kind=self.default_kind) - else: - raise ValueError('only DataFrame objects can be set currently') - - self._frames[key] = value - - if key not in self.items: - self._items = Index(list(self.items) + [key]) - - def set_value(self, item, major, minor, value): - """ - Quickly set single value at (item, major, minor) location - - Parameters - ---------- - item : item label (panel item) - major : major axis label (panel item row) - minor : minor axis label (panel item column) - value : scalar - - Notes - ----- - This method *always* returns a new object. 
It is not particularly - efficient but is provided for API compatibility with Panel - - Returns - ------- - panel : SparsePanel - """ - dense = self.to_dense().set_value(item, major, minor, value) - return dense.to_sparse(kind=self.default_kind, - fill_value=self.default_fill_value) - - def __delitem__(self, key): - loc = self.items.get_loc(key) - indices = lrange(loc) + lrange(loc + 1, len(self.items)) - del self._frames[key] - self._items = self._items.take(indices) - - def __getstate__(self): - # pickling - from pandas.io.pickle import _pickle_array - return (self._frames, _pickle_array(self.items), - _pickle_array(self.major_axis), - _pickle_array(self.minor_axis), self.default_fill_value, - self.default_kind) - - def __setstate__(self, state): - frames, items, major, minor, fv, kind = state - - from pandas.io.pickle import _unpickle_array - self.default_fill_value = fv - self.default_kind = kind - self._items = _ensure_index(_unpickle_array(items)) - self._major_axis = _ensure_index(_unpickle_array(major)) - self._minor_axis = _ensure_index(_unpickle_array(minor)) - self._frames = frames - - def copy(self, deep=True): - """ - Make a copy of the sparse panel - - Returns - ------- - copy : SparsePanel - """ - - d = self._construct_axes_dict() - if deep: - new_data = dict((k, v.copy(deep=True)) - for k, v in compat.iteritems(self._frames)) - d = dict((k, v.copy(deep=True)) for k, v in compat.iteritems(d)) - else: - new_data = self._frames.copy() - d['default_fill_value'] = self.default_fill_value - d['default_kind'] = self.default_kind - - return SparsePanel(new_data, **d) - - def to_frame(self, filter_observations=True): - """ - Convert SparsePanel to (dense) DataFrame - - Returns - ------- - frame : DataFrame - """ - if not filter_observations: - raise TypeError('filter_observations=False not supported for ' - 'SparsePanel.to_long') - - I, N, K = self.shape - counts = np.zeros(N * K, dtype=int) - - d_values = {} - d_indexer = {} - - for item in self.items: - frame = self[item] - - values, major, minor = _stack_sparse_info(frame) - - # values are stacked column-major - indexer = minor * N + major - counts.put(indexer, counts.take(indexer) + 1) # cuteness - - d_values[item] = values - d_indexer[item] = indexer - - # have full set of observations for each item - mask = counts == I - - # for each item, take mask values at index locations for those sparse - # values, and use that to select values - values = np.column_stack([d_values[item][mask.take(d_indexer[item])] - for item in self.items]) - - inds, = mask.nonzero() - - # still column major - major_labels = inds % N - minor_labels = inds // N - - index = MultiIndex(levels=[self.major_axis, self.minor_axis], - labels=[major_labels, minor_labels], - verify_integrity=False) - - df = DataFrame(values, index=index, columns=self.items) - return df.sortlevel(level=0) - - to_long = deprecate('to_long', to_frame) - toLong = deprecate('toLong', to_frame) - - def reindex(self, major=None, items=None, minor=None, major_axis=None, - minor_axis=None, copy=False): - """ - Conform / reshape panel axis labels to new input labels - - Parameters - ---------- - major : array-like, default None - items : array-like, default None - minor : array-like, default None - copy : boolean, default False - Copy underlying SparseDataFrame objects - - Returns - ------- - reindexed : SparsePanel - """ - major = com._mut_exclusive(major=major, major_axis=major_axis) - minor = com._mut_exclusive(minor=minor, minor_axis=minor_axis) - - if com._all_none(items, major, minor): 
- raise ValueError('Must specify at least one axis') - - major = self.major_axis if major is None else major - minor = self.minor_axis if minor is None else minor - - if items is not None: - new_frames = {} - for item in items: - if item in self._frames: - new_frames[item] = self._frames[item] - else: - raise NotImplementedError('Reindexing with new items not ' - 'yet supported') - else: - new_frames = self._frames - - if copy: - new_frames = dict((k, v.copy()) - for k, v in compat.iteritems(new_frames)) - - return SparsePanel(new_frames, items=items, major_axis=major, - minor_axis=minor, - default_fill_value=self.default_fill_value, - default_kind=self.default_kind) - - def _combine(self, other, func, axis=0): - if isinstance(other, DataFrame): - return self._combineFrame(other, func, axis=axis) - elif isinstance(other, Panel): - return self._combinePanel(other, func) - elif lib.isscalar(other): - new_frames = dict((k, func(v, other)) - for k, v in self.iteritems()) - return self._new_like(new_frames) - - def _combineFrame(self, other, func, axis=0): - index, columns = self._get_plane_axes(axis) - axis = self._get_axis_number(axis) - - other = other.reindex(index=index, columns=columns) - - if axis == 0: - new_values = func(self.values, other.values) - elif axis == 1: - new_values = func(self.values.swapaxes(0, 1), other.values.T) - new_values = new_values.swapaxes(0, 1) - elif axis == 2: - new_values = func(self.values.swapaxes(0, 2), other.values) - new_values = new_values.swapaxes(0, 2) - - # TODO: make faster! - new_frames = {} - for item, item_slice in zip(self.items, new_values): - old_frame = self[item] - ofv = old_frame.default_fill_value - ok = old_frame.default_kind - new_frames[item] = SparseDataFrame(item_slice, - index=self.major_axis, - columns=self.minor_axis, - default_fill_value=ofv, - default_kind=ok) - - return self._new_like(new_frames) - - def _new_like(self, new_frames): - return SparsePanel(new_frames, self.items, self.major_axis, - self.minor_axis, - default_fill_value=self.default_fill_value, - default_kind=self.default_kind) - - def _combinePanel(self, other, func): - items = self.items.union(other.items) - major = self.major_axis.union(other.major_axis) - minor = self.minor_axis.union(other.minor_axis) - - # could check that everything's the same size, but forget it - - this = self.reindex(items=items, major=major, minor=minor) - other = other.reindex(items=items, major=major, minor=minor) - - new_frames = {} - for item in items: - new_frames[item] = func(this[item], other[item]) - - if not isinstance(other, SparsePanel): - new_default_fill = self.default_fill_value - else: - # maybe unnecessary - new_default_fill = func(self.default_fill_value, - other.default_fill_value) - - return SparsePanel(new_frames, items, major, minor, - default_fill_value=new_default_fill, - default_kind=self.default_kind) - - def major_xs(self, key): - """ - Return slice of panel along major axis - - Parameters - ---------- - key : object - Major axis label - - Returns - ------- - y : DataFrame - index -> minor axis, columns -> items - """ - slices = dict((k, v.xs(key)) for k, v in self.iteritems()) - return DataFrame(slices, index=self.minor_axis, columns=self.items) - - def minor_xs(self, key): - """ - Return slice of panel along minor axis - - Parameters - ---------- - key : object - Minor axis label - - Returns - ------- - y : SparseDataFrame - index -> major axis, columns -> items - """ - slices = dict((k, v[key]) for k, v in self.iteritems()) - return SparseDataFrame(slices, 
index=self.major_axis, - columns=self.items, - default_fill_value=self.default_fill_value, - default_kind=self.default_kind) - - # TODO: allow SparsePanel to work with flex arithmetic. - # pow and mod only work for scalars for now - def pow(self, val, *args, **kwargs): - """wrapper around `__pow__` (only works for scalar values)""" - return self.__pow__(val) - - def mod(self, val, *args, **kwargs): - """wrapper around `__mod__` (only works for scalar values""" - return self.__mod__(val) - -# Sparse objects opt out of numexpr -SparsePanel._add_aggregate_operations(use_numexpr=False) -ops.add_special_arithmetic_methods(SparsePanel, use_numexpr=False, ** - ops.panel_special_funcs) -SparseWidePanel = SparsePanel - - -def _convert_frames(frames, index, columns, fill_value=np.nan, kind='block'): - from pandas.core.panel import _get_combined_index - output = {} - for item, df in compat.iteritems(frames): - if not isinstance(df, SparseDataFrame): - df = SparseDataFrame(df, default_kind=kind, - default_fill_value=fill_value) - - output[item] = df - - if index is None: - all_indexes = [x.index for x in output.values()] - index = _get_combined_index(all_indexes) - if columns is None: - all_columns = [x.columns for x in output.values()] - columns = _get_combined_index(all_columns) - - index = _ensure_index(index) - columns = _ensure_index(columns) - - for item, df in compat.iteritems(output): - if not (df.index.equals(index) and df.columns.equals(columns)): - output[item] = df.reindex(index=index, columns=columns) - - return output, index, columns - - -def _stack_sparse_info(frame): - lengths = [s.sp_index.npoints for _, s in compat.iteritems(frame)] - - # this is pretty fast - minor_labels = np.repeat(np.arange(len(frame.columns)), lengths) - - inds_to_concat = [] - vals_to_concat = [] - for col in frame.columns: - series = frame[col] - - if not np.isnan(series.fill_value): - raise TypeError('This routine assumes NaN fill value') - - int_index = series.sp_index.to_int_index() - inds_to_concat.append(int_index.indices) - vals_to_concat.append(series.sp_values) - - major_labels = np.concatenate(inds_to_concat) - sparse_values = np.concatenate(vals_to_concat) - - return sparse_values, major_labels, minor_labels diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py index 519068b97a010..ad9168890b8f2 100644 --- a/pandas/sparse/series.py +++ b/pandas/sparse/series.py @@ -7,10 +7,12 @@ import numpy as np import warnings -import operator + +from pandas.types.missing import isnull, notnull +from pandas.types.common import is_scalar +from pandas.core.common import _values_from_object, _maybe_match_name from pandas.compat.numpy import function as nv -from pandas.core.common import isnull, _values_from_object, _maybe_match_name from pandas.core.index import Index, _ensure_index, InvalidIndexError from pandas.core.series import Series from pandas.core.frame import DataFrame @@ -19,7 +21,6 @@ import pandas.core.common as com import pandas.core.ops as ops import pandas.index as _index -import pandas.lib as lib from pandas.util.decorators import Appender from pandas.sparse.array import (make_sparse, _sparse_array_op, SparseArray, @@ -55,18 +56,12 @@ def wrapper(self, other): return _sparse_series_op(self, other, op, name) elif isinstance(other, DataFrame): return NotImplemented - elif lib.isscalar(other): - if isnull(other) or isnull(self.fill_value): - new_fill_value = np.nan - else: - new_fill_value = op(np.float64(self.fill_value), - np.float64(other)) - - return SparseSeries(op(self.sp_values, other), - 
index=self.index, - sparse_index=self.sp_index, - fill_value=new_fill_value, - name=self.name) + elif is_scalar(other): + with np.errstate(all='ignore'): + new_values = op(self.values, other) + return self._constructor(new_values, + index=self.index, + name=self.name) else: # pragma: no cover raise TypeError('operation with %s not supported' % type(other)) @@ -83,8 +78,9 @@ def _sparse_series_op(left, right, op, name): new_index = left.index new_name = _maybe_match_name(left, right) - result = _sparse_array_op(left, right, op, name) - return SparseSeries(result, index=new_index, name=new_name) + result = _sparse_array_op(left.values, right.values, op, name, + series=True) + return left._constructor(result, index=new_index, name=new_name) class SparseSeries(Series): @@ -95,7 +91,8 @@ class SparseSeries(Series): data : {array-like, Series, SparseSeries, dict} kind : {'block', 'integer'} fill_value : float - Defaults to NaN (code for missing) + Code for missing value. Defaults depends on dtype. + 0 for int dtype, False for bool dtype, and NaN for other dtypes sparse_index : {BlockIndex, IntIndex}, optional Only if you have one. Mainly used internally @@ -129,26 +126,20 @@ def __init__(self, data=None, index=None, sparse_index=None, kind='block', if isinstance(data, Series) and name is None: name = data.name - is_sparse_array = isinstance(data, SparseArray) - if fill_value is None: - if is_sparse_array: - fill_value = data.fill_value - else: - fill_value = np.nan - - if is_sparse_array: - if isinstance(data, SparseSeries) and index is None: - index = data.index.view() - elif index is not None: + if isinstance(data, SparseArray): + if index is not None: assert (len(index) == len(data)) - sparse_index = data.sp_index + if fill_value is None: + fill_value = data.fill_value + data = np.asarray(data) elif isinstance(data, SparseSeries): if index is None: index = data.index.view() - + if fill_value is None: + fill_value = data.fill_value # extract the SingleBlockManager data = data._data @@ -157,14 +148,14 @@ def __init__(self, data=None, index=None, sparse_index=None, kind='block', index = data.index.view() data = Series(data) - data, sparse_index = make_sparse(data, kind=kind, - fill_value=fill_value) + res = make_sparse(data, kind=kind, fill_value=fill_value) + data, sparse_index, fill_value = res elif isinstance(data, (tuple, list, np.ndarray)): # array-like if sparse_index is None: - data, sparse_index = make_sparse(data, kind=kind, - fill_value=fill_value) + res = make_sparse(data, kind=kind, fill_value=fill_value) + data, sparse_index, fill_value = res else: assert (len(data) == sparse_index.npoints) @@ -306,13 +297,23 @@ def __unicode__(self): rep = '%s\n%s' % (series_rep, repr(self.sp_index)) return rep - def __array_wrap__(self, result): + def __array_wrap__(self, result, context=None): """ Gets called prior to a ufunc (and after) + + See SparseArray.__array_wrap__ for detail. 
""" + if isinstance(context, tuple) and len(context) == 3: + ufunc, args, domain = context + args = [getattr(a, 'fill_value', a) for a in args] + with np.errstate(all='ignore'): + fill_value = ufunc(self.fill_value, *args[1:]) + else: + fill_value = self.fill_value + return self._constructor(result, index=self.index, sparse_index=self.sp_index, - fill_value=self.fill_value, + fill_value=fill_value, copy=False).__finalize__(self) def __array_finalize__(self, obj): @@ -433,10 +434,8 @@ def abs(self): ------- abs: type of caller """ - res_sp_values = np.abs(self.sp_values) - return self._constructor(res_sp_values, index=self.index, - sparse_index=self.sp_index, - fill_value=self.fill_value).__finalize__(self) + return self._constructor(np.abs(self.values), + index=self.index).__finalize__(self) def get(self, label, default=None): """ @@ -607,6 +606,7 @@ def take(self, indices, axis=0, convert=True, *args, **kwargs): ------- taken : ndarray """ + convert = nv.validate_take_with_convert(convert, args, kwargs) new_values = SparseArray.take(self.values, indices) new_index = self.index.take(indices) @@ -631,6 +631,20 @@ def cumsum(self, axis=0, *args, **kwargs): # TODO: gh-12855 - return a SparseSeries here return Series(new_array, index=self.index).__finalize__(self) + @Appender(generic._shared_docs['isnull']) + def isnull(self): + arr = SparseArray(isnull(self.values.sp_values), + sparse_index=self.values.sp_index, + fill_value=isnull(self.fill_value)) + return self._constructor(arr, index=self.index).__finalize__(self) + + @Appender(generic._shared_docs['isnotnull']) + def isnotnull(self): + arr = SparseArray(notnull(self.values.sp_values), + sparse_index=self.values.sp_index, + fill_value=notnull(self.fill_value)) + return self._constructor(arr, index=self.index).__finalize__(self) + def dropna(self, axis=0, inplace=False, **kwargs): """ Analogous to Series.dropna. If fill_value=NaN, returns a dense Series @@ -803,7 +817,7 @@ def from_coo(cls, A, dense_index=False): # overwrite basic arithmetic to use SparseSeries version # force methods to overwrite previous definitions. ops.add_special_arithmetic_methods(SparseSeries, _arith_method, - radd_func=operator.add, comp_method=None, + comp_method=_arith_method, bool_method=None, use_numexpr=False, force=True) diff --git a/pandas/sparse/tests/test_arithmetics.py b/pandas/sparse/tests/test_arithmetics.py new file mode 100644 index 0000000000000..f24244b38c42b --- /dev/null +++ b/pandas/sparse/tests/test_arithmetics.py @@ -0,0 +1,453 @@ +import numpy as np +import pandas as pd +import pandas.util.testing as tm + + +class TestSparseArrayArithmetics(tm.TestCase): + + _multiprocess_can_split_ = True + + _base = np.array + _klass = pd.SparseArray + + def _assert(self, a, b): + tm.assert_numpy_array_equal(a, b) + + def _check_numeric_ops(self, a, b, a_dense, b_dense): + with np.errstate(invalid='ignore', divide='ignore'): + # Unfortunately, trying to wrap the computation of each expected + # value is with np.errstate() is too tedious. 
+ + # sparse & sparse + self._assert((a + b).to_dense(), a_dense + b_dense) + self._assert((b + a).to_dense(), b_dense + a_dense) + + self._assert((a - b).to_dense(), a_dense - b_dense) + self._assert((b - a).to_dense(), b_dense - a_dense) + + self._assert((a * b).to_dense(), a_dense * b_dense) + self._assert((b * a).to_dense(), b_dense * a_dense) + + # pandas uses future division + self._assert((a / b).to_dense(), a_dense * 1.0 / b_dense) + self._assert((b / a).to_dense(), b_dense * 1.0 / a_dense) + + # ToDo: FIXME in GH 13843 + if not (self._base == pd.Series and a.dtype == 'int64'): + self._assert((a // b).to_dense(), a_dense // b_dense) + self._assert((b // a).to_dense(), b_dense // a_dense) + + self._assert((a % b).to_dense(), a_dense % b_dense) + self._assert((b % a).to_dense(), b_dense % a_dense) + + self._assert((a ** b).to_dense(), a_dense ** b_dense) + self._assert((b ** a).to_dense(), b_dense ** a_dense) + + # sparse & dense + self._assert((a + b_dense).to_dense(), a_dense + b_dense) + self._assert((b_dense + a).to_dense(), b_dense + a_dense) + + self._assert((a - b_dense).to_dense(), a_dense - b_dense) + self._assert((b_dense - a).to_dense(), b_dense - a_dense) + + self._assert((a * b_dense).to_dense(), a_dense * b_dense) + self._assert((b_dense * a).to_dense(), b_dense * a_dense) + + # pandas uses future division + self._assert((a / b_dense).to_dense(), a_dense * 1.0 / b_dense) + self._assert((b_dense / a).to_dense(), b_dense * 1.0 / a_dense) + + # ToDo: FIXME in GH 13843 + if not (self._base == pd.Series and a.dtype == 'int64'): + self._assert((a // b_dense).to_dense(), a_dense // b_dense) + self._assert((b_dense // a).to_dense(), b_dense // a_dense) + + self._assert((a % b_dense).to_dense(), a_dense % b_dense) + self._assert((b_dense % a).to_dense(), b_dense % a_dense) + + self._assert((a ** b_dense).to_dense(), a_dense ** b_dense) + self._assert((b_dense ** a).to_dense(), b_dense ** a_dense) + + def _check_bool_result(self, res): + tm.assertIsInstance(res, self._klass) + self.assertEqual(res.dtype, np.bool) + self.assertIsInstance(res.fill_value, bool) + + def _check_comparison_ops(self, a, b, a_dense, b_dense): + with np.errstate(invalid='ignore'): + # Unfortunately, trying to wrap the computation of each expected + # value is with np.errstate() is too tedious. 
+ # + # sparse & sparse + self._check_bool_result(a == b) + self._assert((a == b).to_dense(), a_dense == b_dense) + + self._check_bool_result(a != b) + self._assert((a != b).to_dense(), a_dense != b_dense) + + self._check_bool_result(a >= b) + self._assert((a >= b).to_dense(), a_dense >= b_dense) + + self._check_bool_result(a <= b) + self._assert((a <= b).to_dense(), a_dense <= b_dense) + + self._check_bool_result(a > b) + self._assert((a > b).to_dense(), a_dense > b_dense) + + self._check_bool_result(a < b) + self._assert((a < b).to_dense(), a_dense < b_dense) + + # sparse & dense + self._check_bool_result(a == b_dense) + self._assert((a == b_dense).to_dense(), a_dense == b_dense) + + self._check_bool_result(a != b_dense) + self._assert((a != b_dense).to_dense(), a_dense != b_dense) + + self._check_bool_result(a >= b_dense) + self._assert((a >= b_dense).to_dense(), a_dense >= b_dense) + + self._check_bool_result(a <= b_dense) + self._assert((a <= b_dense).to_dense(), a_dense <= b_dense) + + self._check_bool_result(a > b_dense) + self._assert((a > b_dense).to_dense(), a_dense > b_dense) + + self._check_bool_result(a < b_dense) + self._assert((a < b_dense).to_dense(), a_dense < b_dense) + + def _check_logical_ops(self, a, b, a_dense, b_dense): + # sparse & sparse + self._check_bool_result(a & b) + self._assert((a & b).to_dense(), a_dense & b_dense) + + self._check_bool_result(a | b) + self._assert((a | b).to_dense(), a_dense | b_dense) + # sparse & dense + self._check_bool_result(a & b_dense) + self._assert((a & b_dense).to_dense(), a_dense & b_dense) + + self._check_bool_result(a | b_dense) + self._assert((a | b_dense).to_dense(), a_dense | b_dense) + + def test_float_scalar(self): + values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + + for kind in ['integer', 'block']: + a = self._klass(values, kind=kind) + self._check_numeric_ops(a, 1, values, 1) + self._check_numeric_ops(a, 0, values, 0) + self._check_numeric_ops(a, 3, values, 3) + + a = self._klass(values, kind=kind, fill_value=0) + self._check_numeric_ops(a, 1, values, 1) + self._check_numeric_ops(a, 0, values, 0) + self._check_numeric_ops(a, 3, values, 3) + + a = self._klass(values, kind=kind, fill_value=2) + self._check_numeric_ops(a, 1, values, 1) + self._check_numeric_ops(a, 0, values, 0) + self._check_numeric_ops(a, 3, values, 3) + + def test_float_scalar_comparison(self): + values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + + for kind in ['integer', 'block']: + a = self._klass(values, kind=kind) + self._check_comparison_ops(a, 1, values, 1) + self._check_comparison_ops(a, 0, values, 0) + self._check_comparison_ops(a, 3, values, 3) + + a = self._klass(values, kind=kind, fill_value=0) + self._check_comparison_ops(a, 1, values, 1) + self._check_comparison_ops(a, 0, values, 0) + self._check_comparison_ops(a, 3, values, 3) + + a = self._klass(values, kind=kind, fill_value=2) + self._check_comparison_ops(a, 1, values, 1) + self._check_comparison_ops(a, 0, values, 0) + self._check_comparison_ops(a, 3, values, 3) + + def test_float_same_index(self): + # when sp_index are the same + for kind in ['integer', 'block']: + values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + rvalues = self._base([np.nan, 2, 3, 4, np.nan, 0, 1, 3, 2, np.nan]) + + a = self._klass(values, kind=kind) + b = self._klass(rvalues, kind=kind) + self._check_numeric_ops(a, b, values, rvalues) + + values = self._base([0., 1., 2., 6., 0., 0., 1., 2., 1., 0.]) + rvalues = self._base([0., 2., 3., 4., 0., 0., 1., 3., 2., 
0.]) + + a = self._klass(values, kind=kind, fill_value=0) + b = self._klass(rvalues, kind=kind, fill_value=0) + self._check_numeric_ops(a, b, values, rvalues) + + def test_float_same_index_comparison(self): + # when sp_index are the same + for kind in ['integer', 'block']: + values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + rvalues = self._base([np.nan, 2, 3, 4, np.nan, 0, 1, 3, 2, np.nan]) + + a = self._klass(values, kind=kind) + b = self._klass(rvalues, kind=kind) + self._check_comparison_ops(a, b, values, rvalues) + + values = self._base([0., 1., 2., 6., 0., 0., 1., 2., 1., 0.]) + rvalues = self._base([0., 2., 3., 4., 0., 0., 1., 3., 2., 0.]) + + a = self._klass(values, kind=kind, fill_value=0) + b = self._klass(rvalues, kind=kind, fill_value=0) + self._check_comparison_ops(a, b, values, rvalues) + + def test_float_array(self): + values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + rvalues = self._base([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan]) + + for kind in ['integer', 'block']: + a = self._klass(values, kind=kind) + b = self._klass(rvalues, kind=kind) + self._check_numeric_ops(a, b, values, rvalues) + self._check_numeric_ops(a, b * 0, values, rvalues * 0) + + a = self._klass(values, kind=kind, fill_value=0) + b = self._klass(rvalues, kind=kind) + self._check_numeric_ops(a, b, values, rvalues) + + a = self._klass(values, kind=kind, fill_value=0) + b = self._klass(rvalues, kind=kind, fill_value=0) + self._check_numeric_ops(a, b, values, rvalues) + + a = self._klass(values, kind=kind, fill_value=1) + b = self._klass(rvalues, kind=kind, fill_value=2) + self._check_numeric_ops(a, b, values, rvalues) + + def test_float_array_different_kind(self): + values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + rvalues = self._base([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan]) + + a = self._klass(values, kind='integer') + b = self._klass(rvalues, kind='block') + self._check_numeric_ops(a, b, values, rvalues) + self._check_numeric_ops(a, b * 0, values, rvalues * 0) + + a = self._klass(values, kind='integer', fill_value=0) + b = self._klass(rvalues, kind='block') + self._check_numeric_ops(a, b, values, rvalues) + + a = self._klass(values, kind='integer', fill_value=0) + b = self._klass(rvalues, kind='block', fill_value=0) + self._check_numeric_ops(a, b, values, rvalues) + + a = self._klass(values, kind='integer', fill_value=1) + b = self._klass(rvalues, kind='block', fill_value=2) + self._check_numeric_ops(a, b, values, rvalues) + + def test_float_array_comparison(self): + values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + rvalues = self._base([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan]) + + for kind in ['integer', 'block']: + a = self._klass(values, kind=kind) + b = self._klass(rvalues, kind=kind) + self._check_comparison_ops(a, b, values, rvalues) + self._check_comparison_ops(a, b * 0, values, rvalues * 0) + + a = self._klass(values, kind=kind, fill_value=0) + b = self._klass(rvalues, kind=kind) + self._check_comparison_ops(a, b, values, rvalues) + + a = self._klass(values, kind=kind, fill_value=0) + b = self._klass(rvalues, kind=kind, fill_value=0) + self._check_comparison_ops(a, b, values, rvalues) + + a = self._klass(values, kind=kind, fill_value=1) + b = self._klass(rvalues, kind=kind, fill_value=2) + self._check_comparison_ops(a, b, values, rvalues) + + def test_int_array(self): + # have to specify dtype explicitly until fixing GH 667 + dtype = np.int64 + + values = self._base([0, 1, 2, 0, 0, 0, 1, 2, 1, 0], 
dtype=dtype) + rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=dtype) + + for kind in ['integer', 'block']: + a = self._klass(values, dtype=dtype, kind=kind) + self.assertEqual(a.dtype, dtype) + b = self._klass(rvalues, dtype=dtype, kind=kind) + self.assertEqual(b.dtype, dtype) + + self._check_numeric_ops(a, b, values, rvalues) + self._check_numeric_ops(a, b * 0, values, rvalues * 0) + + a = self._klass(values, fill_value=0, dtype=dtype, kind=kind) + self.assertEqual(a.dtype, dtype) + b = self._klass(rvalues, dtype=dtype, kind=kind) + self.assertEqual(b.dtype, dtype) + + self._check_numeric_ops(a, b, values, rvalues) + + a = self._klass(values, fill_value=0, dtype=dtype, kind=kind) + self.assertEqual(a.dtype, dtype) + b = self._klass(rvalues, fill_value=0, dtype=dtype, kind=kind) + self.assertEqual(b.dtype, dtype) + self._check_numeric_ops(a, b, values, rvalues) + + a = self._klass(values, fill_value=1, dtype=dtype, kind=kind) + self.assertEqual(a.dtype, dtype) + b = self._klass(rvalues, fill_value=2, dtype=dtype, kind=kind) + self.assertEqual(b.dtype, dtype) + self._check_numeric_ops(a, b, values, rvalues) + + def test_int_array_comparison(self): + + # int32 NI ATM + for dtype in ['int64']: + values = self._base([0, 1, 2, 0, 0, 0, 1, 2, 1, 0], dtype=dtype) + rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=dtype) + + for kind in ['integer', 'block']: + a = self._klass(values, dtype=dtype, kind=kind) + b = self._klass(rvalues, dtype=dtype, kind=kind) + self._check_comparison_ops(a, b, values, rvalues) + self._check_comparison_ops(a, b * 0, values, rvalues * 0) + + a = self._klass(values, dtype=dtype, kind=kind, fill_value=0) + b = self._klass(rvalues, dtype=dtype, kind=kind) + self._check_comparison_ops(a, b, values, rvalues) + + a = self._klass(values, dtype=dtype, kind=kind, fill_value=0) + b = self._klass(rvalues, dtype=dtype, kind=kind, fill_value=0) + self._check_comparison_ops(a, b, values, rvalues) + + a = self._klass(values, dtype=dtype, kind=kind, fill_value=1) + b = self._klass(rvalues, dtype=dtype, kind=kind, fill_value=2) + self._check_comparison_ops(a, b, values, rvalues) + + def test_bool_same_index(self): + # GH 14000 + # when sp_index are the same + for kind in ['integer', 'block']: + values = self._base([True, False, True, True], dtype=np.bool) + rvalues = self._base([True, False, True, True], dtype=np.bool) + + for fill_value in [True, False, np.nan]: + a = self._klass(values, kind=kind, dtype=np.bool, + fill_value=fill_value) + b = self._klass(rvalues, kind=kind, dtype=np.bool, + fill_value=fill_value) + self._check_logical_ops(a, b, values, rvalues) + + def test_bool_array_logical(self): + # GH 14000 + # when sp_index are the same + for kind in ['integer', 'block']: + values = self._base([True, False, True, False, True, True], + dtype=np.bool) + rvalues = self._base([True, False, False, True, False, True], + dtype=np.bool) + + for fill_value in [True, False, np.nan]: + a = self._klass(values, kind=kind, dtype=np.bool, + fill_value=fill_value) + b = self._klass(rvalues, kind=kind, dtype=np.bool, + fill_value=fill_value) + self._check_logical_ops(a, b, values, rvalues) + + def test_mixed_array_float_int(self): + + for rdtype in ['int64']: + values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=rdtype) + + for kind in ['integer', 'block']: + a = self._klass(values, kind=kind) + b = self._klass(rvalues, kind=kind) + self.assertEqual(b.dtype, rdtype) + + self._check_numeric_ops(a, 
b, values, rvalues) + self._check_numeric_ops(a, b * 0, values, rvalues * 0) + + a = self._klass(values, kind=kind, fill_value=0) + b = self._klass(rvalues, kind=kind) + self.assertEqual(b.dtype, rdtype) + self._check_numeric_ops(a, b, values, rvalues) + + a = self._klass(values, kind=kind, fill_value=0) + b = self._klass(rvalues, kind=kind, fill_value=0) + self.assertEqual(b.dtype, rdtype) + self._check_numeric_ops(a, b, values, rvalues) + + a = self._klass(values, kind=kind, fill_value=1) + b = self._klass(rvalues, kind=kind, fill_value=2) + self.assertEqual(b.dtype, rdtype) + self._check_numeric_ops(a, b, values, rvalues) + + def test_mixed_array_comparison(self): + + # int32 NI ATM + for rdtype in ['int64']: + values = self._base([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) + rvalues = self._base([2, 0, 2, 3, 0, 0, 1, 5, 2, 0], dtype=rdtype) + + for kind in ['integer', 'block']: + a = self._klass(values, kind=kind) + b = self._klass(rvalues, kind=kind) + self.assertEqual(b.dtype, rdtype) + + self._check_comparison_ops(a, b, values, rvalues) + self._check_comparison_ops(a, b * 0, values, rvalues * 0) + + a = self._klass(values, kind=kind, fill_value=0) + b = self._klass(rvalues, kind=kind) + self.assertEqual(b.dtype, rdtype) + self._check_comparison_ops(a, b, values, rvalues) + + a = self._klass(values, kind=kind, fill_value=0) + b = self._klass(rvalues, kind=kind, fill_value=0) + self.assertEqual(b.dtype, rdtype) + self._check_comparison_ops(a, b, values, rvalues) + + a = self._klass(values, kind=kind, fill_value=1) + b = self._klass(rvalues, kind=kind, fill_value=2) + self.assertEqual(b.dtype, rdtype) + self._check_comparison_ops(a, b, values, rvalues) + + +class TestSparseSeriesArithmetic(TestSparseArrayArithmetics): + + _base = pd.Series + _klass = pd.SparseSeries + + def _assert(self, a, b): + tm.assert_series_equal(a, b) + + def test_alignment(self): + da = pd.Series(np.arange(4)) + db = pd.Series(np.arange(4), index=[1, 2, 3, 4]) + + sa = pd.SparseSeries(np.arange(4), dtype=np.int64, fill_value=0) + sb = pd.SparseSeries(np.arange(4), index=[1, 2, 3, 4], + dtype=np.int64, fill_value=0) + self._check_numeric_ops(sa, sb, da, db) + + sa = pd.SparseSeries(np.arange(4), dtype=np.int64, fill_value=np.nan) + sb = pd.SparseSeries(np.arange(4), index=[1, 2, 3, 4], + dtype=np.int64, fill_value=np.nan) + self._check_numeric_ops(sa, sb, da, db) + + da = pd.Series(np.arange(4)) + db = pd.Series(np.arange(4), index=[10, 11, 12, 13]) + + sa = pd.SparseSeries(np.arange(4), dtype=np.int64, fill_value=0) + sb = pd.SparseSeries(np.arange(4), index=[10, 11, 12, 13], + dtype=np.int64, fill_value=0) + self._check_numeric_ops(sa, sb, da, db) + + sa = pd.SparseSeries(np.arange(4), dtype=np.int64, fill_value=np.nan) + sb = pd.SparseSeries(np.arange(4), index=[10, 11, 12, 13], + dtype=np.int64, fill_value=np.nan) + self._check_numeric_ops(sa, sb, da, db) diff --git a/pandas/sparse/tests/test_array.py b/pandas/sparse/tests/test_array.py index dd2126d0f52d2..dd86e9e791e5e 100644 --- a/pandas/sparse/tests/test_array.py +++ b/pandas/sparse/tests/test_array.py @@ -7,7 +7,7 @@ import numpy as np from pandas import _np_version_under1p8 -from pandas.sparse.api import SparseArray +from pandas.sparse.api import SparseArray, SparseSeries from pandas._sparse import IntIndex from pandas.util.testing import assert_almost_equal, assertRaisesRegexp import pandas.util.testing as tm @@ -30,9 +30,13 @@ def test_constructor_dtype(self): self.assertEqual(arr.dtype, np.float64) self.assertEqual(arr.fill_value, 0) + arr = 
SparseArray([0, 1, 2, 4], dtype=np.float64) + self.assertEqual(arr.dtype, np.float64) + self.assertTrue(np.isnan(arr.fill_value)) + arr = SparseArray([0, 1, 2, 4], dtype=np.int64) self.assertEqual(arr.dtype, np.int64) - self.assertTrue(np.isnan(arr.fill_value)) + self.assertEqual(arr.fill_value, 0) arr = SparseArray([0, 1, 2, 4], fill_value=0, dtype=np.int64) self.assertEqual(arr.dtype, np.int64) @@ -40,7 +44,7 @@ def test_constructor_dtype(self): arr = SparseArray([0, 1, 2, 4], dtype=None) self.assertEqual(arr.dtype, np.int64) - self.assertTrue(np.isnan(arr.fill_value)) + self.assertEqual(arr.fill_value, 0) arr = SparseArray([0, 1, 2, 4], fill_value=0, dtype=None) self.assertEqual(arr.dtype, np.int64) @@ -63,13 +67,13 @@ def test_constructor_spindex_dtype(self): self.assertEqual(arr.dtype, np.float64) self.assertTrue(np.isnan(arr.fill_value)) - arr = SparseArray(data=[0, 1, 2, 3], - sparse_index=IntIndex(4, [0, 1, 2, 3]), - dtype=np.int64) - exp = SparseArray([0, 1, 2, 3], dtype=np.int64) + arr = SparseArray(data=[1, 2, 3], + sparse_index=IntIndex(4, [1, 2, 3]), + dtype=np.int64, fill_value=0) + exp = SparseArray([0, 1, 2, 3], dtype=np.int64, fill_value=0) tm.assert_sp_array_equal(arr, exp) self.assertEqual(arr.dtype, np.int64) - self.assertTrue(np.isnan(arr.fill_value)) + self.assertEqual(arr.fill_value, 0) arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2]), fill_value=0, dtype=np.int64) @@ -78,22 +82,20 @@ def test_constructor_spindex_dtype(self): self.assertEqual(arr.dtype, np.int64) self.assertEqual(arr.fill_value, 0) - arr = SparseArray(data=[0, 1, 2, 3], - sparse_index=IntIndex(4, [0, 1, 2, 3]), - dtype=None) + arr = SparseArray(data=[1, 2, 3], + sparse_index=IntIndex(4, [1, 2, 3]), + dtype=None, fill_value=0) exp = SparseArray([0, 1, 2, 3], dtype=None) tm.assert_sp_array_equal(arr, exp) self.assertEqual(arr.dtype, np.int64) - self.assertTrue(np.isnan(arr.fill_value)) + self.assertEqual(arr.fill_value, 0) # scalar input - arr = SparseArray(data=1, - sparse_index=IntIndex(1, [0]), - dtype=None) + arr = SparseArray(data=1, sparse_index=IntIndex(1, [0]), dtype=None) exp = SparseArray([1], dtype=None) tm.assert_sp_array_equal(arr, exp) self.assertEqual(arr.dtype, np.int64) - self.assertTrue(np.isnan(arr.fill_value)) + self.assertEqual(arr.fill_value, 0) arr = SparseArray(data=[1, 2], sparse_index=IntIndex(4, [1, 2]), fill_value=0, dtype=None) @@ -102,6 +104,32 @@ def test_constructor_spindex_dtype(self): self.assertEqual(arr.dtype, np.int64) self.assertEqual(arr.fill_value, 0) + def test_sparseseries_roundtrip(self): + # GH 13999 + for kind in ['integer', 'block']: + for fill in [1, np.nan, 0]: + arr = SparseArray([np.nan, 1, np.nan, 2, 3], kind=kind, + fill_value=fill) + res = SparseArray(SparseSeries(arr)) + tm.assert_sp_array_equal(arr, res) + + arr = SparseArray([0, 0, 0, 1, 1, 2], dtype=np.int64, + kind=kind, fill_value=fill) + res = SparseArray(SparseSeries(arr), dtype=np.int64) + tm.assert_sp_array_equal(arr, res) + + res = SparseArray(SparseSeries(arr)) + tm.assert_sp_array_equal(arr, res) + + for fill in [True, False, np.nan]: + arr = SparseArray([True, False, True, True], dtype=np.bool, + kind=kind, fill_value=fill) + res = SparseArray(SparseSeries(arr)) + tm.assert_sp_array_equal(arr, res) + + res = SparseArray(SparseSeries(arr)) + tm.assert_sp_array_equal(arr, res) + def test_get_item(self): self.assertTrue(np.isnan(self.arr[1])) @@ -324,7 +352,68 @@ def test_astype(self): res.sp_values[:3] = 27 self.assertFalse((self.arr.sp_values[:3] == 27).any()) - 
assertRaisesRegexp(TypeError, "floating point", self.arr.astype, 'i8') + msg = "unable to coerce current fill_value nan to int64 dtype" + with tm.assertRaisesRegexp(ValueError, msg): + self.arr.astype('i8') + + arr = SparseArray([0, np.nan, 0, 1]) + with tm.assertRaisesRegexp(ValueError, msg): + arr.astype('i8') + + arr = SparseArray([0, np.nan, 0, 1], fill_value=0) + msg = "Cannot convert NA to integer" + with tm.assertRaisesRegexp(ValueError, msg): + arr.astype('i8') + + def test_astype_all(self): + vals = np.array([1, 2, 3]) + arr = SparseArray(vals, fill_value=1) + + types = [np.float64, np.float32, np.int64, + np.int32, np.int16, np.int8] + for typ in types: + res = arr.astype(typ) + self.assertEqual(res.dtype, typ) + self.assertEqual(res.sp_values.dtype, typ) + + tm.assert_numpy_array_equal(res.values, vals.astype(typ)) + + def test_set_fill_value(self): + arr = SparseArray([1., np.nan, 2.], fill_value=np.nan) + arr.fill_value = 2 + self.assertEqual(arr.fill_value, 2) + + arr = SparseArray([1, 0, 2], fill_value=0, dtype=np.int64) + arr.fill_value = 2 + self.assertEqual(arr.fill_value, 2) + + # coerces to int + msg = "unable to set fill_value 3\\.1 to int64 dtype" + with tm.assertRaisesRegexp(ValueError, msg): + arr.fill_value = 3.1 + + msg = "unable to set fill_value nan to int64 dtype" + with tm.assertRaisesRegexp(ValueError, msg): + arr.fill_value = np.nan + + arr = SparseArray([True, False, True], fill_value=False, dtype=np.bool) + arr.fill_value = True + self.assertTrue(arr.fill_value) + + # coerces to bool + msg = "unable to set fill_value 0 to bool dtype" + with tm.assertRaisesRegexp(ValueError, msg): + arr.fill_value = 0 + + msg = "unable to set fill_value nan to bool dtype" + with tm.assertRaisesRegexp(ValueError, msg): + arr.fill_value = np.nan + + # invalid + msg = "fill_value must be a scalar" + for val in [[1, 2, 3], np.array([1, 2]), (1, 2, 3)]: + with tm.assertRaisesRegexp(ValueError, msg): + arr.fill_value = val def test_copy_shallow(self): arr2 = self.arr.copy(deep=False) @@ -455,15 +544,17 @@ def _check_inplace_op(op): tmp = arr1.copy() self.assertRaises(NotImplementedError, op, tmp, arr2) - bin_ops = [operator.add, operator.sub, operator.mul, operator.truediv, - operator.floordiv, operator.pow] - for op in bin_ops: - _check_op(op, arr1, arr2) - _check_op(op, farr1, farr2) + with np.errstate(all='ignore'): + bin_ops = [operator.add, operator.sub, operator.mul, + operator.truediv, operator.floordiv, operator.pow] + for op in bin_ops: + _check_op(op, arr1, arr2) + _check_op(op, farr1, farr2) - inplace_ops = ['iadd', 'isub', 'imul', 'itruediv', 'ifloordiv', 'ipow'] - for op in inplace_ops: - _check_inplace_op(getattr(operator, op)) + inplace_ops = ['iadd', 'isub', 'imul', 'itruediv', 'ifloordiv', + 'ipow'] + for op in inplace_ops: + _check_inplace_op(getattr(operator, op)) def test_pickle(self): def _check_roundtrip(obj): @@ -487,44 +578,63 @@ def test_generator_warnings(self): def test_fillna(self): s = SparseArray([1, np.nan, np.nan, 3, np.nan]) res = s.fillna(-1) - exp = SparseArray([1, -1, -1, 3, -1], fill_value=-1) + exp = SparseArray([1, -1, -1, 3, -1], fill_value=-1, dtype=np.float64) tm.assert_sp_array_equal(res, exp) s = SparseArray([1, np.nan, np.nan, 3, np.nan], fill_value=0) res = s.fillna(-1) - exp = SparseArray([1, -1, -1, 3, -1], fill_value=0) + exp = SparseArray([1, -1, -1, 3, -1], fill_value=0, dtype=np.float64) tm.assert_sp_array_equal(res, exp) s = SparseArray([1, np.nan, 0, 3, 0]) res = s.fillna(-1) - exp = SparseArray([1, -1, 0, 3, 0], 
fill_value=-1) + exp = SparseArray([1, -1, 0, 3, 0], fill_value=-1, dtype=np.float64) tm.assert_sp_array_equal(res, exp) s = SparseArray([1, np.nan, 0, 3, 0], fill_value=0) res = s.fillna(-1) - exp = SparseArray([1, -1, 0, 3, 0], fill_value=0) + exp = SparseArray([1, -1, 0, 3, 0], fill_value=0, dtype=np.float64) tm.assert_sp_array_equal(res, exp) s = SparseArray([np.nan, np.nan, np.nan, np.nan]) res = s.fillna(-1) - exp = SparseArray([-1, -1, -1, -1], fill_value=-1) + exp = SparseArray([-1, -1, -1, -1], fill_value=-1, dtype=np.float64) tm.assert_sp_array_equal(res, exp) s = SparseArray([np.nan, np.nan, np.nan, np.nan], fill_value=0) res = s.fillna(-1) - exp = SparseArray([-1, -1, -1, -1], fill_value=0) + exp = SparseArray([-1, -1, -1, -1], fill_value=0, dtype=np.float64) tm.assert_sp_array_equal(res, exp) - s = SparseArray([0, 0, 0, 0]) + # float dtype's fill_value is np.nan, replaced by -1 + s = SparseArray([0., 0., 0., 0.]) res = s.fillna(-1) - exp = SparseArray([0, 0, 0, 0], fill_value=-1) + exp = SparseArray([0., 0., 0., 0.], fill_value=-1) tm.assert_sp_array_equal(res, exp) + # int dtype shouldn't have missing. No changes. + s = SparseArray([0, 0, 0, 0]) + self.assertEqual(s.dtype, np.int64) + self.assertEqual(s.fill_value, 0) + res = s.fillna(-1) + tm.assert_sp_array_equal(res, s) + s = SparseArray([0, 0, 0, 0], fill_value=0) + self.assertEqual(s.dtype, np.int64) + self.assertEqual(s.fill_value, 0) res = s.fillna(-1) exp = SparseArray([0, 0, 0, 0], fill_value=0) tm.assert_sp_array_equal(res, exp) + # fill_value can be nan if there is no missing hole. + # only fill_value will be changed + s = SparseArray([0, 0, 0, 0], fill_value=np.nan) + self.assertEqual(s.dtype, np.int64) + self.assertTrue(np.isnan(s.fill_value)) + res = s.fillna(-1) + exp = SparseArray([0, 0, 0, 0], fill_value=-1) + tm.assert_sp_array_equal(res, exp) + def test_fillna_overlap(self): s = SparseArray([1, np.nan, np.nan, 3, np.nan]) # filling with existing value doesn't replace existing value with @@ -535,199 +645,10 @@ def test_fillna_overlap(self): s = SparseArray([1, np.nan, np.nan, 3, np.nan], fill_value=0) res = s.fillna(3) - exp = SparseArray([1, 3, 3, 3, 3], fill_value=0) + exp = SparseArray([1, 3, 3, 3, 3], fill_value=0, dtype=np.float64) tm.assert_sp_array_equal(res, exp) -class TestSparseArrayArithmetic(tm.TestCase): - - _multiprocess_can_split_ = True - - def _check_numeric_ops(self, a, b, a_dense, b_dense): - tm.assert_numpy_array_equal((a + b).to_dense(), a_dense + b_dense) - tm.assert_numpy_array_equal((b + a).to_dense(), b_dense + a_dense) - - tm.assert_numpy_array_equal((a - b).to_dense(), a_dense - b_dense) - tm.assert_numpy_array_equal((b - a).to_dense(), b_dense - a_dense) - - tm.assert_numpy_array_equal((a * b).to_dense(), a_dense * b_dense) - tm.assert_numpy_array_equal((b * a).to_dense(), b_dense * a_dense) - - tm.assert_numpy_array_equal((a / b).to_dense(), a_dense / b_dense) - tm.assert_numpy_array_equal((b / a).to_dense(), b_dense / a_dense) - - tm.assert_numpy_array_equal((a // b).to_dense(), a_dense // b_dense) - tm.assert_numpy_array_equal((b // a).to_dense(), b_dense // a_dense) - - tm.assert_numpy_array_equal((a % b).to_dense(), a_dense % b_dense) - tm.assert_numpy_array_equal((b % a).to_dense(), b_dense % a_dense) - - tm.assert_numpy_array_equal((a ** b).to_dense(), a_dense ** b_dense) - tm.assert_numpy_array_equal((b ** a).to_dense(), b_dense ** a_dense) - - def _check_comparison_ops(self, a, b, a_dense, b_dense): - - def _check(res): - tm.assertIsInstance(res, SparseArray) - 
self.assertEqual(res.dtype, np.bool) - self.assertIsInstance(res.fill_value, bool) - - _check(a == b) - tm.assert_numpy_array_equal((a == b).to_dense(), a_dense == b_dense) - - _check(a != b) - tm.assert_numpy_array_equal((a != b).to_dense(), a_dense != b_dense) - - _check(a >= b) - tm.assert_numpy_array_equal((a >= b).to_dense(), a_dense >= b_dense) - - _check(a <= b) - tm.assert_numpy_array_equal((a <= b).to_dense(), a_dense <= b_dense) - - _check(a > b) - tm.assert_numpy_array_equal((a > b).to_dense(), a_dense > b_dense) - - _check(a < b) - tm.assert_numpy_array_equal((a < b).to_dense(), a_dense < b_dense) - - def test_float_scalar(self): - values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) - - for kind in ['integer', 'block']: - a = SparseArray(values, kind=kind) - self._check_numeric_ops(a, 1, values, 1) - self._check_numeric_ops(a, 0, values, 0) - self._check_numeric_ops(a, 3, values, 3) - - a = SparseArray(values, kind=kind, fill_value=0) - self._check_numeric_ops(a, 1, values, 1) - self._check_numeric_ops(a, 0, values, 0) - self._check_numeric_ops(a, 3, values, 3) - - a = SparseArray(values, kind=kind, fill_value=2) - self._check_numeric_ops(a, 1, values, 1) - self._check_numeric_ops(a, 0, values, 0) - self._check_numeric_ops(a, 3, values, 3) - - def test_float_scalar_comparison(self): - values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) - - for kind in ['integer', 'block']: - a = SparseArray(values, kind=kind) - self._check_comparison_ops(a, 1, values, 1) - self._check_comparison_ops(a, 0, values, 0) - self._check_comparison_ops(a, 3, values, 3) - - a = SparseArray(values, kind=kind, fill_value=0) - self._check_comparison_ops(a, 1, values, 1) - self._check_comparison_ops(a, 0, values, 0) - self._check_comparison_ops(a, 3, values, 3) - - a = SparseArray(values, kind=kind, fill_value=2) - self._check_comparison_ops(a, 1, values, 1) - self._check_comparison_ops(a, 0, values, 0) - self._check_comparison_ops(a, 3, values, 3) - - def test_float_same_index(self): - # when sp_index are the same - for kind in ['integer', 'block']: - values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) - rvalues = np.array([np.nan, 2, 3, 4, np.nan, 0, 1, 3, 2, np.nan]) - - a = SparseArray(values, kind=kind) - b = SparseArray(rvalues, kind=kind) - self._check_numeric_ops(a, b, values, rvalues) - - values = np.array([0., 1., 2., 6., 0., 0., 1., 2., 1., 0.]) - rvalues = np.array([0., 2., 3., 4., 0., 0., 1., 3., 2., 0.]) - - a = SparseArray(values, kind=kind, fill_value=0) - b = SparseArray(rvalues, kind=kind, fill_value=0) - self._check_numeric_ops(a, b, values, rvalues) - - def test_float_same_index_comparison(self): - # when sp_index are the same - for kind in ['integer', 'block']: - values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) - rvalues = np.array([np.nan, 2, 3, 4, np.nan, 0, 1, 3, 2, np.nan]) - - a = SparseArray(values, kind=kind) - b = SparseArray(rvalues, kind=kind) - self._check_comparison_ops(a, b, values, rvalues) - - values = np.array([0., 1., 2., 6., 0., 0., 1., 2., 1., 0.]) - rvalues = np.array([0., 2., 3., 4., 0., 0., 1., 3., 2., 0.]) - - a = SparseArray(values, kind=kind, fill_value=0) - b = SparseArray(rvalues, kind=kind, fill_value=0) - self._check_comparison_ops(a, b, values, rvalues) - - def test_float_array(self): - values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) - rvalues = np.array([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan]) - - for kind in ['integer', 'block']: - a = SparseArray(values, kind=kind) - b = 
SparseArray(rvalues, kind=kind) - self._check_numeric_ops(a, b, values, rvalues) - self._check_numeric_ops(a, b * 0, values, rvalues * 0) - - a = SparseArray(values, kind=kind, fill_value=0) - b = SparseArray(rvalues, kind=kind) - self._check_numeric_ops(a, b, values, rvalues) - - a = SparseArray(values, kind=kind, fill_value=0) - b = SparseArray(rvalues, kind=kind, fill_value=0) - self._check_numeric_ops(a, b, values, rvalues) - - a = SparseArray(values, kind=kind, fill_value=1) - b = SparseArray(rvalues, kind=kind, fill_value=2) - self._check_numeric_ops(a, b, values, rvalues) - - def test_float_array_different_kind(self): - values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) - rvalues = np.array([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan]) - - a = SparseArray(values, kind='integer') - b = SparseArray(rvalues, kind='block') - self._check_numeric_ops(a, b, values, rvalues) - self._check_numeric_ops(a, b * 0, values, rvalues * 0) - - a = SparseArray(values, kind='integer', fill_value=0) - b = SparseArray(rvalues, kind='block') - self._check_numeric_ops(a, b, values, rvalues) - - a = SparseArray(values, kind='integer', fill_value=0) - b = SparseArray(rvalues, kind='block', fill_value=0) - self._check_numeric_ops(a, b, values, rvalues) - - a = SparseArray(values, kind='integer', fill_value=1) - b = SparseArray(rvalues, kind='block', fill_value=2) - self._check_numeric_ops(a, b, values, rvalues) - - def test_float_array_comparison(self): - values = np.array([np.nan, 1, 2, 0, np.nan, 0, 1, 2, 1, np.nan]) - rvalues = np.array([2, np.nan, 2, 3, np.nan, 0, 1, 5, 2, np.nan]) - - for kind in ['integer', 'block']: - a = SparseArray(values, kind=kind) - b = SparseArray(rvalues, kind=kind) - self._check_comparison_ops(a, b, values, rvalues) - self._check_comparison_ops(a, b * 0, values, rvalues * 0) - - a = SparseArray(values, kind=kind, fill_value=0) - b = SparseArray(rvalues, kind=kind) - self._check_comparison_ops(a, b, values, rvalues) - - a = SparseArray(values, kind=kind, fill_value=0) - b = SparseArray(rvalues, kind=kind, fill_value=0) - self._check_comparison_ops(a, b, values, rvalues) - - a = SparseArray(values, kind=kind, fill_value=1) - b = SparseArray(rvalues, kind=kind, fill_value=2) - self._check_comparison_ops(a, b, values, rvalues) - - class TestSparseArrayAnalytics(tm.TestCase): def test_sum(self): data = np.arange(10).astype(float) @@ -829,6 +750,52 @@ def test_numpy_mean(self): tm.assertRaisesRegexp(ValueError, msg, np.mean, SparseArray(data), out=out) + def test_ufunc(self): + # GH 13853 make sure ufunc is applied to fill_value + sparse = SparseArray([1, np.nan, 2, np.nan, -2]) + result = SparseArray([1, np.nan, 2, np.nan, 2]) + tm.assert_sp_array_equal(abs(sparse), result) + tm.assert_sp_array_equal(np.abs(sparse), result) + + sparse = SparseArray([1, -1, 2, -2], fill_value=1) + result = SparseArray([1, 2, 2], sparse_index=sparse.sp_index, + fill_value=1) + tm.assert_sp_array_equal(abs(sparse), result) + tm.assert_sp_array_equal(np.abs(sparse), result) + + sparse = SparseArray([1, -1, 2, -2], fill_value=-1) + result = SparseArray([1, 2, 2], sparse_index=sparse.sp_index, + fill_value=1) + tm.assert_sp_array_equal(abs(sparse), result) + tm.assert_sp_array_equal(np.abs(sparse), result) + + sparse = SparseArray([1, np.nan, 2, np.nan, -2]) + result = SparseArray(np.sin([1, np.nan, 2, np.nan, -2])) + tm.assert_sp_array_equal(np.sin(sparse), result) + + sparse = SparseArray([1, -1, 2, -2], fill_value=1) + result = SparseArray(np.sin([1, -1, 2, -2]), fill_value=np.sin(1)) + 
tm.assert_sp_array_equal(np.sin(sparse), result) + + sparse = SparseArray([1, -1, 0, -2], fill_value=0) + result = SparseArray(np.sin([1, -1, 0, -2]), fill_value=np.sin(0)) + tm.assert_sp_array_equal(np.sin(sparse), result) + + def test_ufunc_args(self): + # GH 13853 make sure ufunc is applied to fill_value, including its arg + sparse = SparseArray([1, np.nan, 2, np.nan, -2]) + result = SparseArray([2, np.nan, 3, np.nan, -1]) + tm.assert_sp_array_equal(np.add(sparse, 1), result) + + sparse = SparseArray([1, -1, 2, -2], fill_value=1) + result = SparseArray([2, 0, 3, -1], fill_value=2) + tm.assert_sp_array_equal(np.add(sparse, 1), result) + + sparse = SparseArray([1, -1, 0, -2], fill_value=0) + result = SparseArray([2, 0, 1, -1], fill_value=1) + tm.assert_sp_array_equal(np.add(sparse, 1), result) + + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/sparse/tests/test_format.py b/pandas/sparse/tests/test_format.py index 9bdc1fdd101ea..377eaa20565a2 100644 --- a/pandas/sparse/tests/test_format.py +++ b/pandas/sparse/tests/test_format.py @@ -13,7 +13,7 @@ use_32bit_repr = is_platform_windows() or is_platform_32bit() -class TestSeriesFormatting(tm.TestCase): +class TestSparseSeriesFormatting(tm.TestCase): _multiprocess_can_split_ = True @@ -62,3 +62,59 @@ def test_sparse_mi_max_row(self): "Block locations: array([0, 3]{0})\n" "Block lengths: array([1, 1]{0})".format(dfm)) self.assertEqual(result, exp) + + def test_sparse_bool(self): + # GH 13110 + s = pd.SparseSeries([True, False, False, True, False, False], + fill_value=False) + result = repr(s) + dtype = '' if use_32bit_repr else ', dtype=int32' + exp = ("0 True\n1 False\n2 False\n" + "3 True\n4 False\n5 False\n" + "dtype: bool\nBlockIndex\n" + "Block locations: array([0, 3]{0})\n" + "Block lengths: array([1, 1]{0})".format(dtype)) + self.assertEqual(result, exp) + + with option_context("display.max_rows", 3): + result = repr(s) + exp = ("0 True\n ... 
\n5 False\n" + "dtype: bool\nBlockIndex\n" + "Block locations: array([0, 3]{0})\n" + "Block lengths: array([1, 1]{0})".format(dtype)) + self.assertEqual(result, exp) + + def test_sparse_int(self): + # GH 13110 + s = pd.SparseSeries([0, 1, 0, 0, 1, 0], fill_value=False) + + result = repr(s) + dtype = '' if use_32bit_repr else ', dtype=int32' + exp = ("0 0\n1 1\n2 0\n3 0\n4 1\n" + "5 0\ndtype: int64\nBlockIndex\n" + "Block locations: array([1, 4]{0})\n" + "Block lengths: array([1, 1]{0})".format(dtype)) + self.assertEqual(result, exp) + + with option_context("display.max_rows", 3): + result = repr(s) + exp = ("0 0\n ..\n5 0\n" + "dtype: int64\nBlockIndex\n" + "Block locations: array([1, 4]{0})\n" + "Block lengths: array([1, 1]{0})".format(dtype)) + self.assertEqual(result, exp) + + +class TestSparseDataFrameFormatting(tm.TestCase): + + def test_sparse_frame(self): + # GH 13110 + df = pd.DataFrame({'A': [True, False, True, False, True], + 'B': [True, False, True, False, True], + 'C': [0, 0, 3, 0, 5], + 'D': [np.nan, np.nan, np.nan, 1, 2]}) + sparse = df.to_sparse() + self.assertEqual(repr(sparse), repr(df)) + + with option_context("display.max_rows", 3): + self.assertEqual(repr(sparse), repr(df)) diff --git a/pandas/sparse/tests/test_frame.py b/pandas/sparse/tests/test_frame.py index 43d35a4e7f72e..5cc765a2c1cf3 100644 --- a/pandas/sparse/tests/test_frame.py +++ b/pandas/sparse/tests/test_frame.py @@ -8,14 +8,14 @@ from pandas import Series, DataFrame, bdate_range, Panel from pandas.tseries.index import DatetimeIndex -import pandas.core.datetools as datetools +from pandas.tseries.offsets import BDay import pandas.util.testing as tm from pandas.compat import lrange from pandas import compat import pandas.sparse.frame as spf from pandas._sparse import BlockIndex, IntIndex -from pandas.sparse.api import SparseSeries, SparseDataFrame +from pandas.sparse.api import SparseSeries, SparseDataFrame, SparseArray from pandas.tests.frame.test_misc_api import SharedWithSparse @@ -25,10 +25,9 @@ class TestSparseDataFrame(tm.TestCase, SharedWithSparse): _multiprocess_can_split_ = True def setUp(self): - self.data = {'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], 'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], - 'C': np.arange(10), + 'C': np.arange(10, dtype=np.float64), 'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]} self.dates = bdate_range('1/1/2011', periods=10) @@ -125,10 +124,12 @@ def test_constructor(self): default_fill_value=self.frame.default_fill_value, default_kind=self.frame.default_kind, copy=True) reindexed = self.frame.reindex(idx) + tm.assert_sp_frame_equal(cons, reindexed, exact_indices=False) # assert level parameter breaks reindex - self.assertRaises(TypeError, self.frame.reindex, idx, level=0) + with tm.assertRaises(TypeError): + self.frame.reindex(idx, level=0) repr(self.frame) @@ -192,6 +193,28 @@ def test_constructor_from_series(self): # without sparse value raises error # df2 = SparseDataFrame([x2_sparse, y]) + def test_constructor_preserve_attr(self): + # GH 13866 + arr = pd.SparseArray([1, 0, 3, 0], dtype=np.int64, fill_value=0) + self.assertEqual(arr.dtype, np.int64) + self.assertEqual(arr.fill_value, 0) + + df = pd.SparseDataFrame({'x': arr}) + self.assertEqual(df['x'].dtype, np.int64) + self.assertEqual(df['x'].fill_value, 0) + + s = pd.SparseSeries(arr, name='x') + self.assertEqual(s.dtype, np.int64) + self.assertEqual(s.fill_value, 0) + + df = pd.SparseDataFrame(s) + self.assertEqual(df['x'].dtype, np.int64) + self.assertEqual(df['x'].fill_value, 0) + + df = pd.SparseDataFrame({'x': s}) + 
self.assertEqual(df['x'].dtype, np.int64) + self.assertEqual(df['x'].fill_value, 0) + def test_dtypes(self): df = DataFrame(np.random.randn(10000, 4)) df.ix[:9998] = np.nan @@ -547,18 +570,23 @@ def test_apply(self): self.frame.to_dense().apply(nanops.nansum)) def test_apply_nonuq(self): - df_orig = DataFrame( - [[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=['a', 'a', 'c']) - df = df_orig.to_sparse() - rs = df.apply(lambda s: s[0], axis=1) - xp = Series([1., 4., 7.], ['a', 'a', 'c']) - tm.assert_series_equal(rs, xp) + orig = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], + index=['a', 'a', 'c']) + sparse = orig.to_sparse() + res = sparse.apply(lambda s: s[0], axis=1) + exp = orig.apply(lambda s: s[0], axis=1) + # dtype must be kept + self.assertEqual(res.dtype, np.int64) + # ToDo: apply must return subclassed dtype + self.assertIsInstance(res, pd.Series) + tm.assert_series_equal(res.to_dense(), exp) # df.T breaks - df = df_orig.T.to_sparse() - rs = df.apply(lambda s: s[0], axis=0) # noqa + sparse = orig.T.to_sparse() + res = sparse.apply(lambda s: s[0], axis=0) # noqa + exp = orig.T.apply(lambda s: s[0], axis=0) # TODO: no non-unique columns supported in sparse yet - # assert_series_equal(rs, xp) + # tm.assert_series_equal(res.to_dense(), exp) def test_applymap(self): # just test that it works @@ -566,7 +594,63 @@ def test_applymap(self): tm.assertIsInstance(result, SparseDataFrame) def test_astype(self): - self.assertRaises(Exception, self.frame.astype, np.int64) + sparse = pd.SparseDataFrame({'A': SparseArray([1, 2, 3, 4], + dtype=np.int64), + 'B': SparseArray([4, 5, 6, 7], + dtype=np.int64)}) + self.assertEqual(sparse['A'].dtype, np.int64) + self.assertEqual(sparse['B'].dtype, np.int64) + + res = sparse.astype(np.float64) + exp = pd.SparseDataFrame({'A': SparseArray([1., 2., 3., 4.], + fill_value=0.), + 'B': SparseArray([4., 5., 6., 7.], + fill_value=0.)}, + default_fill_value=np.nan) + tm.assert_sp_frame_equal(res, exp) + self.assertEqual(res['A'].dtype, np.float64) + self.assertEqual(res['B'].dtype, np.float64) + + sparse = pd.SparseDataFrame({'A': SparseArray([0, 2, 0, 4], + dtype=np.int64), + 'B': SparseArray([0, 5, 0, 7], + dtype=np.int64)}, + default_fill_value=0) + self.assertEqual(sparse['A'].dtype, np.int64) + self.assertEqual(sparse['B'].dtype, np.int64) + + res = sparse.astype(np.float64) + exp = pd.SparseDataFrame({'A': SparseArray([0., 2., 0., 4.], + fill_value=0.), + 'B': SparseArray([0., 5., 0., 7.], + fill_value=0.)}, + default_fill_value=0.) 
+ tm.assert_sp_frame_equal(res, exp) + self.assertEqual(res['A'].dtype, np.float64) + self.assertEqual(res['B'].dtype, np.float64) + + def test_astype_bool(self): + sparse = pd.SparseDataFrame({'A': SparseArray([0, 2, 0, 4], + fill_value=0, + dtype=np.int64), + 'B': SparseArray([0, 5, 0, 7], + fill_value=0, + dtype=np.int64)}, + default_fill_value=0) + self.assertEqual(sparse['A'].dtype, np.int64) + self.assertEqual(sparse['B'].dtype, np.int64) + + res = sparse.astype(bool) + exp = pd.SparseDataFrame({'A': SparseArray([False, True, False, True], + dtype=np.bool, + fill_value=False), + 'B': SparseArray([False, True, False, True], + dtype=np.bool, + fill_value=False)}, + default_fill_value=False) + tm.assert_sp_frame_equal(res, exp) + self.assertEqual(res['A'].dtype, np.bool) + self.assertEqual(res['B'].dtype, np.bool) def test_fillna(self): df = self.zframe.reindex(lrange(5)) @@ -739,6 +823,10 @@ def _check(frame, orig): untransposed = transposed.T tm.assert_sp_frame_equal(frame, untransposed) + tm.assert_frame_equal(frame.T.to_dense(), orig.T) + tm.assert_frame_equal(frame.T.T.to_dense(), orig.T.T) + tm.assert_sp_frame_equal(frame, frame.T.T, exact_indices=False) + self._check_all(_check) def test_shift(self): @@ -747,8 +835,8 @@ def _check(frame, orig): shifted = frame.shift(0) exp = orig.shift(0) - # int is coerced to float dtype - tm.assert_frame_equal(shifted.to_dense(), exp, check_dtype=False) + tm.assert_frame_equal(shifted.to_dense(), exp) + shifted = frame.shift(1) exp = orig.shift(1) tm.assert_frame_equal(shifted, exp) @@ -762,8 +850,8 @@ def _check(frame, orig): exp = exp.to_sparse(frame.default_fill_value) tm.assert_frame_equal(shifted, exp) - shifted = frame.shift(2, freq=datetools.bday) - exp = orig.shift(2, freq=datetools.bday) + shifted = frame.shift(2, freq=BDay()) + exp = orig.shift(2, freq=BDay()) exp = exp.to_sparse(frame.default_fill_value) tm.assert_frame_equal(shifted, exp) @@ -858,12 +946,85 @@ def test_nan_columnname(self): nan_colname_sparse = nan_colname.to_sparse() self.assertTrue(np.isnan(nan_colname_sparse.columns[0])) + def test_isnull(self): + # GH 8276 + df = pd.SparseDataFrame({'A': [np.nan, np.nan, 1, 2, np.nan], + 'B': [0, np.nan, np.nan, 2, np.nan]}) + + res = df.isnull() + exp = pd.SparseDataFrame({'A': [True, True, False, False, True], + 'B': [False, True, True, False, True]}, + default_fill_value=True) + exp._default_fill_value = np.nan + tm.assert_sp_frame_equal(res, exp) + + # if fill_value is not nan, True can be included in sp_values + df = pd.SparseDataFrame({'A': [0, 0, 1, 2, np.nan], + 'B': [0, np.nan, 0, 2, np.nan]}, + default_fill_value=0.) + res = df.isnull() + tm.assertIsInstance(res, pd.SparseDataFrame) + exp = pd.DataFrame({'A': [False, False, False, False, True], + 'B': [False, True, False, False, True]}) + tm.assert_frame_equal(res.to_dense(), exp) + + def test_isnotnull(self): + # GH 8276 + df = pd.SparseDataFrame({'A': [np.nan, np.nan, 1, 2, np.nan], + 'B': [0, np.nan, np.nan, 2, np.nan]}) + + res = df.isnotnull() + exp = pd.SparseDataFrame({'A': [False, False, True, True, False], + 'B': [True, False, False, True, False]}, + default_fill_value=False) + exp._default_fill_value = np.nan + tm.assert_sp_frame_equal(res, exp) + + # if fill_value is not nan, True can be included in sp_values + df = pd.SparseDataFrame({'A': [0, 0, 1, 2, np.nan], + 'B': [0, np.nan, 0, 2, np.nan]}, + default_fill_value=0.) 
+ res = df.isnotnull() + tm.assertIsInstance(res, pd.SparseDataFrame) + exp = pd.DataFrame({'A': [True, True, True, True, False], + 'B': [True, False, True, True, False]}) + tm.assert_frame_equal(res.to_dense(), exp) + + +class TestSparseDataFrameArithmetic(tm.TestCase): + + def test_numeric_op_scalar(self): + df = pd.DataFrame({'A': [nan, nan, 0, 1, ], + 'B': [0, 1, 2, nan], + 'C': [1., 2., 3., 4.], + 'D': [nan, nan, nan, nan]}) + sparse = df.to_sparse() + + tm.assert_sp_frame_equal(sparse + 1, (df + 1).to_sparse()) + + def test_comparison_op_scalar(self): + # GH 13001 + df = pd.DataFrame({'A': [nan, nan, 0, 1, ], + 'B': [0, 1, 2, nan], + 'C': [1., 2., 3., 4.], + 'D': [nan, nan, nan, nan]}) + sparse = df.to_sparse() + + # comparison changes internal repr, compare with dense + res = sparse > 1 + tm.assertIsInstance(res, pd.SparseDataFrame) + tm.assert_frame_equal(res.to_dense(), df > 1) + + res = sparse != 0 + tm.assertIsInstance(res, pd.SparseDataFrame) + tm.assert_frame_equal(res.to_dense(), df != 0) + class TestSparseDataFrameAnalytics(tm.TestCase): def setUp(self): self.data = {'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], 'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], - 'C': np.arange(10), + 'C': np.arange(10, dtype=float), 'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]} self.dates = bdate_range('1/1/2011', periods=10) diff --git a/pandas/sparse/tests/test_indexing.py b/pandas/sparse/tests/test_indexing.py index 1f88d22bd8f93..c0d4b70c41dc4 100644 --- a/pandas/sparse/tests/test_indexing.py +++ b/pandas/sparse/tests/test_indexing.py @@ -36,6 +36,10 @@ def test_getitem(self): exp = orig[orig % 2 == 1].to_sparse() tm.assert_sp_series_equal(result, exp) + # sparse array + result = sparse[pd.SparseArray(sparse % 2 == 1, dtype=bool)] + tm.assert_sp_series_equal(result, exp) + def test_getitem_slice(self): orig = self.orig sparse = self.sparse @@ -45,6 +49,21 @@ def test_getitem_slice(self): tm.assert_sp_series_equal(sparse[::2], orig[::2].to_sparse()) tm.assert_sp_series_equal(sparse[-5:], orig[-5:].to_sparse()) + def test_getitem_int_dtype(self): + # GH 8292 + s = pd.SparseSeries([0, 1, 2, 3, 4, 5, 6], name='xxx') + res = s[::2] + exp = pd.SparseSeries([0, 2, 4, 6], index=[0, 2, 4, 6], name='xxx') + tm.assert_sp_series_equal(res, exp) + self.assertEqual(res.dtype, np.int64) + + s = pd.SparseSeries([0, 1, 2, 3, 4, 5, 6], fill_value=0, name='xxx') + res = s[::2] + exp = pd.SparseSeries([0, 2, 4, 6], index=[0, 2, 4, 6], + fill_value=0, name='xxx') + tm.assert_sp_series_equal(res, exp) + self.assertEqual(res.dtype, np.int64) + def test_getitem_fill_value(self): orig = pd.Series([1, np.nan, 0, 3, 0]) sparse = orig.to_sparse(fill_value=0) @@ -68,6 +87,10 @@ def test_getitem_fill_value(self): exp = orig[orig % 2 == 1].to_sparse(fill_value=0) tm.assert_sp_series_equal(result, exp) + # sparse array + result = sparse[pd.SparseArray(sparse % 2 == 1, dtype=bool)] + tm.assert_sp_series_equal(result, exp) + def test_getitem_ellipsis(self): # GH 9467 s = pd.SparseSeries([1, np.nan, 2, 0, np.nan]) @@ -116,6 +139,10 @@ def test_loc(self): exp = orig.loc[orig % 2 == 1].to_sparse() tm.assert_sp_series_equal(result, exp) + # sparse array + result = sparse.loc[pd.SparseArray(sparse % 2 == 1, dtype=bool)] + tm.assert_sp_series_equal(result, exp) + def test_loc_index(self): orig = pd.Series([1, np.nan, np.nan, 3, np.nan], index=list('ABCDE')) sparse = orig.to_sparse() @@ -137,6 +164,10 @@ def test_loc_index(self): exp = orig.loc[orig % 2 == 1].to_sparse() tm.assert_sp_series_equal(result, exp) + # sparse array + result = 
sparse[pd.SparseArray(sparse % 2 == 1, dtype=bool)] + tm.assert_sp_series_equal(result, exp) + def test_loc_index_fill_value(self): orig = pd.Series([1, np.nan, 0, 3, 0], index=list('ABCDE')) sparse = orig.to_sparse(fill_value=0) @@ -368,6 +399,35 @@ def test_reindex_fill_value(self): exp = orig.reindex(['A', 'E', 'C', 'D']).to_sparse(fill_value=0) tm.assert_sp_series_equal(res, exp) + def tests_indexing_with_sparse(self): + # GH 13985 + + for kind in ['integer', 'block']: + for fill in [True, False, np.nan]: + arr = pd.SparseArray([1, 2, 3], kind=kind) + indexer = pd.SparseArray([True, False, True], fill_value=fill, + dtype=bool) + + tm.assert_sp_array_equal(pd.SparseArray([1, 3], kind=kind), + arr[indexer]) + + s = pd.SparseSeries(arr, index=['a', 'b', 'c'], + dtype=np.float64) + exp = pd.SparseSeries([1, 3], index=['a', 'c'], + dtype=np.float64, kind=kind) + tm.assert_sp_series_equal(s[indexer], exp) + tm.assert_sp_series_equal(s.loc[indexer], exp) + tm.assert_sp_series_equal(s.iloc[indexer], exp) + + indexer = pd.SparseSeries(indexer, index=['a', 'b', 'c']) + tm.assert_sp_series_equal(s[indexer], exp) + tm.assert_sp_series_equal(s.loc[indexer], exp) + + msg = ("iLocation based boolean indexing cannot use an " + "indexable as a mask") + with tm.assertRaisesRegexp(ValueError, msg): + s.iloc[indexer] + class TestSparseSeriesMultiIndexing(TestSparseSeriesIndexing): @@ -405,6 +465,10 @@ def test_getitem_multi(self): exp = orig[orig % 2 == 1].to_sparse() tm.assert_sp_series_equal(result, exp) + # sparse array + result = sparse[pd.SparseArray(sparse % 2 == 1, dtype=bool)] + tm.assert_sp_series_equal(result, exp) + def test_getitem_multi_tuple(self): orig = self.orig sparse = self.sparse @@ -454,6 +518,10 @@ def test_loc(self): exp = orig.loc[orig % 2 == 1].to_sparse() tm.assert_sp_series_equal(result, exp) + # sparse array + result = sparse.loc[pd.SparseArray(sparse % 2 == 1, dtype=bool)] + tm.assert_sp_series_equal(result, exp) + def test_loc_multi_tuple(self): orig = self.orig sparse = self.sparse @@ -578,6 +646,10 @@ def test_loc(self): exp = orig.loc[orig.x % 2 == 1].to_sparse() tm.assert_sp_frame_equal(result, exp) + # sparse array + result = sparse.loc[pd.SparseArray(sparse.x % 2 == 1, dtype=bool)] + tm.assert_sp_frame_equal(result, exp) + def test_loc_index(self): orig = pd.DataFrame([[1, np.nan, np.nan], [2, 3, np.nan], @@ -627,6 +699,10 @@ def test_loc_index(self): exp = orig.loc[orig.x % 2 == 1].to_sparse() tm.assert_sp_frame_equal(result, exp) + # sparse array + result = sparse.loc[pd.SparseArray(sparse.x % 2 == 1, dtype=bool)] + tm.assert_sp_frame_equal(result, exp) + def test_loc_slice(self): orig = pd.DataFrame([[1, np.nan, np.nan], [2, 3, np.nan], @@ -829,3 +905,81 @@ def test_reindex_fill_value(self): res = sparse.reindex(['A', 'C', 'B']) exp = orig.reindex(['A', 'C', 'B']).to_sparse(fill_value=0) tm.assert_sp_frame_equal(res, exp) + + +class TestMultitype(tm.TestCase): + def setUp(self): + self.cols = ['string', 'int', 'float', 'object'] + + self.string_series = pd.SparseSeries(['a', 'b', 'c']) + self.int_series = pd.SparseSeries([1, 2, 3]) + self.float_series = pd.SparseSeries([1.1, 1.2, 1.3]) + self.object_series = pd.SparseSeries([[], {}, set()]) + self.sdf = pd.SparseDataFrame({ + 'string': self.string_series, + 'int': self.int_series, + 'float': self.float_series, + 'object': self.object_series, + }) + self.sdf = self.sdf[self.cols] + self.ss = pd.SparseSeries(['a', 1, 1.1, []], index=self.cols) + + def test_frame_basic_dtypes(self): + for _, row in self.sdf.iterrows(): 
+ self.assertEqual(row.dtype, object) + tm.assert_sp_series_equal(self.sdf['string'], self.string_series, + check_names=False) + tm.assert_sp_series_equal(self.sdf['int'], self.int_series, + check_names=False) + tm.assert_sp_series_equal(self.sdf['float'], self.float_series, + check_names=False) + tm.assert_sp_series_equal(self.sdf['object'], self.object_series, + check_names=False) + + def test_frame_indexing_single(self): + tm.assert_sp_series_equal(self.sdf.iloc[0], + pd.SparseSeries(['a', 1, 1.1, []], + index=self.cols), + check_names=False) + tm.assert_sp_series_equal(self.sdf.iloc[1], + pd.SparseSeries(['b', 2, 1.2, {}], + index=self.cols), + check_names=False) + tm.assert_sp_series_equal(self.sdf.iloc[2], + pd.SparseSeries(['c', 3, 1.3, set()], + index=self.cols), + check_names=False) + + def test_frame_indexing_multiple(self): + tm.assert_sp_frame_equal(self.sdf, self.sdf[:]) + tm.assert_sp_frame_equal(self.sdf, self.sdf.loc[:]) + tm.assert_sp_frame_equal(self.sdf.iloc[[1, 2]], + pd.SparseDataFrame({ + 'string': self.string_series.iloc[[1, 2]], + 'int': self.int_series.iloc[[1, 2]], + 'float': self.float_series.iloc[[1, 2]], + 'object': self.object_series.iloc[[1, 2]] + }, index=[1, 2])[self.cols]) + tm.assert_sp_frame_equal(self.sdf[['int', 'string']], + pd.SparseDataFrame({ + 'int': self.int_series, + 'string': self.string_series, + })) + + def test_series_indexing_single(self): + for i, idx in enumerate(self.cols): + self.assertEqual(self.ss.iloc[i], self.ss[idx]) + self.assertEqual(type(self.ss.iloc[i]), + type(self.ss[idx])) + self.assertEqual(self.ss['string'], 'a') + self.assertEqual(self.ss['int'], 1) + self.assertEqual(self.ss['float'], 1.1) + self.assertEqual(self.ss['object'], []) + + def test_series_indexing_multiple(self): + tm.assert_sp_series_equal(self.ss.loc[['string', 'int']], + pd.SparseSeries(['a', 1], + index=['string', 'int'])) + tm.assert_sp_series_equal(self.ss.loc[['string', 'object']], + pd.SparseSeries(['a', []], + index=['string', 'object'])) diff --git a/pandas/sparse/tests/test_libsparse.py b/pandas/sparse/tests/test_libsparse.py index 11bf980a99fec..c289b4a1b204f 100644 --- a/pandas/sparse/tests/test_libsparse.py +++ b/pandas/sparse/tests/test_libsparse.py @@ -243,6 +243,61 @@ class TestSparseIndexCommon(tm.TestCase): _multiprocess_can_split_ = True + def test_int_internal(self): + idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind='integer') + self.assertIsInstance(idx, IntIndex) + self.assertEqual(idx.npoints, 2) + tm.assert_numpy_array_equal(idx.indices, + np.array([2, 3], dtype=np.int32)) + + idx = _make_index(4, np.array([], dtype=np.int32), kind='integer') + self.assertIsInstance(idx, IntIndex) + self.assertEqual(idx.npoints, 0) + tm.assert_numpy_array_equal(idx.indices, + np.array([], dtype=np.int32)) + + idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), + kind='integer') + self.assertIsInstance(idx, IntIndex) + self.assertEqual(idx.npoints, 4) + tm.assert_numpy_array_equal(idx.indices, + np.array([0, 1, 2, 3], dtype=np.int32)) + + def test_block_internal(self): + idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind='block') + self.assertIsInstance(idx, BlockIndex) + self.assertEqual(idx.npoints, 2) + tm.assert_numpy_array_equal(idx.blocs, + np.array([2], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blengths, + np.array([2], dtype=np.int32)) + + idx = _make_index(4, np.array([], dtype=np.int32), kind='block') + self.assertIsInstance(idx, BlockIndex) + self.assertEqual(idx.npoints, 0) + 
tm.assert_numpy_array_equal(idx.blocs, + np.array([], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blengths, + np.array([], dtype=np.int32)) + + idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), + kind='block') + self.assertIsInstance(idx, BlockIndex) + self.assertEqual(idx.npoints, 4) + tm.assert_numpy_array_equal(idx.blocs, + np.array([0], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blengths, + np.array([4], dtype=np.int32)) + + idx = _make_index(4, np.array([0, 2, 3], dtype=np.int32), + kind='block') + self.assertIsInstance(idx, BlockIndex) + self.assertEqual(idx.npoints, 3) + tm.assert_numpy_array_equal(idx.blocs, + np.array([0, 2], dtype=np.int32)) + tm.assert_numpy_array_equal(idx.blengths, + np.array([1, 2], dtype=np.int32)) + def test_lookup(self): for kind in ['integer', 'block']: idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind=kind) @@ -486,13 +541,14 @@ def _check_case(xloc, xlen, yloc, ylen, eloc, elen): xfill = 0 yfill = 2 - result_block_vals, rb_index = sparse_op(x, xindex, xfill, y, - yindex, yfill) - result_int_vals, ri_index = sparse_op(x, xdindex, xfill, y, - ydindex, yfill) + result_block_vals, rb_index, bfill = sparse_op(x, xindex, xfill, y, + yindex, yfill) + result_int_vals, ri_index, ifill = sparse_op(x, xdindex, xfill, y, + ydindex, yfill) self.assertTrue(rb_index.to_int_index().equals(ri_index)) tm.assert_numpy_array_equal(result_block_vals, result_int_vals) + self.assertEqual(bfill, ifill) # check versus Series... xseries = Series(x, xdindex.indices) @@ -517,7 +573,7 @@ def _check_case(xloc, xlen, yloc, ylen, eloc, elen): def make_optestf(op): def f(self): - sparse_op = getattr(splib, 'sparse_%s' % op) + sparse_op = getattr(splib, 'sparse_%s_float64' % op) python_op = getattr(operator, op) self._op_tests(sparse_op, python_op) diff --git a/pandas/sparse/tests/test_list.py b/pandas/sparse/tests/test_list.py index 5f8627103e18b..b117685b6e968 100644 --- a/pandas/sparse/tests/test_list.py +++ b/pandas/sparse/tests/test_list.py @@ -16,83 +16,102 @@ def setUp(self): self.na_data = np.array([nan, nan, 1, 2, 3, nan, 4, 5, nan, 6]) self.zero_data = np.array([0, 0, 1, 2, 3, 0, 4, 5, 0, 6]) + def test_deprecation(self): + # see gh-13784 + with tm.assert_produces_warning(FutureWarning): + SparseList() + def test_constructor(self): - lst1 = SparseList(self.na_data[:5]) - exp = SparseList() + with tm.assert_produces_warning(FutureWarning): + lst1 = SparseList(self.na_data[:5]) + with tm.assert_produces_warning(FutureWarning): + exp = SparseList() + exp.append(self.na_data[:5]) tm.assert_sp_list_equal(lst1, exp) def test_len(self): - arr = self.na_data - splist = SparseList() - splist.append(arr[:5]) - self.assertEqual(len(splist), 5) - splist.append(arr[5]) - self.assertEqual(len(splist), 6) - splist.append(arr[6:]) - self.assertEqual(len(splist), 10) + with tm.assert_produces_warning(FutureWarning): + arr = self.na_data + splist = SparseList() + splist.append(arr[:5]) + self.assertEqual(len(splist), 5) + splist.append(arr[5]) + self.assertEqual(len(splist), 6) + splist.append(arr[6:]) + self.assertEqual(len(splist), 10) def test_append_na(self): - arr = self.na_data - splist = SparseList() - splist.append(arr[:5]) - splist.append(arr[5]) - splist.append(arr[6:]) + with tm.assert_produces_warning(FutureWarning): + arr = self.na_data + splist = SparseList() + splist.append(arr[:5]) + splist.append(arr[5]) + splist.append(arr[6:]) - sparr = splist.to_array() - tm.assert_sp_array_equal(sparr, SparseArray(arr)) + sparr = splist.to_array() + 
tm.assert_sp_array_equal(sparr, SparseArray(arr)) def test_append_zero(self): - arr = self.zero_data - splist = SparseList(fill_value=0) - splist.append(arr[:5]) - splist.append(arr[5]) - splist.append(arr[6:]) - - sparr = splist.to_array() - tm.assert_sp_array_equal(sparr, SparseArray(arr, fill_value=0)) + with tm.assert_produces_warning(FutureWarning): + arr = self.zero_data + splist = SparseList(fill_value=0) + splist.append(arr[:5]) + splist.append(arr[5]) + splist.append(arr[6:]) + + # list always produces int64, but SA constructor + # is platform dtype aware + sparr = splist.to_array() + exp = SparseArray(arr, fill_value=0) + tm.assert_sp_array_equal(sparr, exp, check_dtype=False) def test_consolidate(self): - arr = self.na_data - exp_sparr = SparseArray(arr) + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + arr = self.na_data + exp_sparr = SparseArray(arr) - splist = SparseList() - splist.append(arr[:5]) - splist.append(arr[5]) - splist.append(arr[6:]) + splist = SparseList() + splist.append(arr[:5]) + splist.append(arr[5]) + splist.append(arr[6:]) - consol = splist.consolidate(inplace=False) - self.assertEqual(consol.nchunks, 1) - self.assertEqual(splist.nchunks, 3) - tm.assert_sp_array_equal(consol.to_array(), exp_sparr) + consol = splist.consolidate(inplace=False) + self.assertEqual(consol.nchunks, 1) + self.assertEqual(splist.nchunks, 3) + tm.assert_sp_array_equal(consol.to_array(), exp_sparr) - splist.consolidate() - self.assertEqual(splist.nchunks, 1) - tm.assert_sp_array_equal(splist.to_array(), exp_sparr) + splist.consolidate() + self.assertEqual(splist.nchunks, 1) + tm.assert_sp_array_equal(splist.to_array(), exp_sparr) def test_copy(self): - arr = self.na_data - exp_sparr = SparseArray(arr) + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + arr = self.na_data + exp_sparr = SparseArray(arr) - splist = SparseList() - splist.append(arr[:5]) - splist.append(arr[5]) + splist = SparseList() + splist.append(arr[:5]) + splist.append(arr[5]) - cp = splist.copy() - cp.append(arr[6:]) - self.assertEqual(splist.nchunks, 2) - tm.assert_sp_array_equal(cp.to_array(), exp_sparr) + cp = splist.copy() + cp.append(arr[6:]) + self.assertEqual(splist.nchunks, 2) + tm.assert_sp_array_equal(cp.to_array(), exp_sparr) def test_getitem(self): - arr = self.na_data - splist = SparseList() - splist.append(arr[:5]) - splist.append(arr[5]) - splist.append(arr[6:]) - - for i in range(len(arr)): - tm.assert_almost_equal(splist[i], arr[i]) - tm.assert_almost_equal(splist[-i], arr[-i]) + with tm.assert_produces_warning(FutureWarning): + arr = self.na_data + splist = SparseList() + splist.append(arr[:5]) + splist.append(arr[5]) + splist.append(arr[6:]) + + for i in range(len(arr)): + tm.assert_almost_equal(splist[i], arr[i]) + tm.assert_almost_equal(splist[-i], arr[-i]) if __name__ == '__main__': diff --git a/pandas/sparse/tests/test_panel.py b/pandas/sparse/tests/test_panel.py deleted file mode 100644 index e988ddebd92f0..0000000000000 --- a/pandas/sparse/tests/test_panel.py +++ /dev/null @@ -1,274 +0,0 @@ -# pylint: disable-msg=E1101,W0612 - -import nose # noqa -from numpy import nan -import pandas as pd - -from pandas import DataFrame, bdate_range, Panel -from pandas.core.index import Index -import pandas.util.testing as tm -from pandas.sparse.api import SparseSeries, SparsePanel -import pandas.tests.test_panel as test_panel - - -def panel_data1(): - index = bdate_range('1/1/2011', periods=8) - - return DataFrame({ - 'A': [nan, nan, nan, 0, 1, 2, 3, 
4], - 'B': [0, 1, 2, 3, 4, nan, nan, nan], - 'C': [0, 1, 2, nan, nan, nan, 3, 4], - 'D': [nan, 0, 1, nan, 2, 3, 4, nan] - }, index=index) - - -def panel_data2(): - index = bdate_range('1/1/2011', periods=9) - - return DataFrame({ - 'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5], - 'B': [0, 1, 2, 3, 4, 5, nan, nan, nan], - 'C': [0, 1, 2, nan, nan, nan, 3, 4, 5], - 'D': [nan, 0, 1, nan, 2, 3, 4, 5, nan] - }, index=index) - - -def panel_data3(): - index = bdate_range('1/1/2011', periods=10).shift(-2) - - return DataFrame({ - 'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], - 'B': [0, 1, 2, 3, 4, 5, 6, nan, nan, nan], - 'C': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], - 'D': [nan, 0, 1, nan, 2, 3, 4, 5, 6, nan] - }, index=index) - - -class TestSparsePanel(tm.TestCase, test_panel.SafeForLongAndSparse, - test_panel.SafeForSparse): - _multiprocess_can_split_ = True - - def setUp(self): - self.data_dict = { - 'ItemA': panel_data1(), - 'ItemB': panel_data2(), - 'ItemC': panel_data3(), - 'ItemD': panel_data1(), - } - with tm.assert_produces_warning(FutureWarning): - self.panel = SparsePanel(self.data_dict) - - @staticmethod - def _test_op(panel, op): - # arithmetic tests - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = op(panel, 1) - tm.assert_sp_frame_equal(result['ItemA'], op(panel['ItemA'], 1)) - - def test_constructor(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - self.assertRaises(ValueError, SparsePanel, self.data_dict, - items=['Item0', 'ItemA', 'ItemB']) - with tm.assertRaisesRegexp(TypeError, - "input must be a dict, a 'list' was " - "passed"): - SparsePanel(['a', 'b', 'c']) - - # deprecation GH11157 - def test_deprecation(self): - with tm.assert_produces_warning(FutureWarning): - SparsePanel() - - # GH 9272 - def test_constructor_empty(self): - with tm.assert_produces_warning(FutureWarning): - sp = SparsePanel() - self.assertEqual(len(sp.items), 0) - self.assertEqual(len(sp.major_axis), 0) - self.assertEqual(len(sp.minor_axis), 0) - - def test_from_dict(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - fd = SparsePanel.from_dict(self.data_dict) - tm.assert_sp_panel_equal(fd, self.panel) - - def test_pickle(self): - def _test_roundtrip(panel): - result = self.round_trip_pickle(panel) - tm.assertIsInstance(result.items, Index) - tm.assertIsInstance(result.major_axis, Index) - tm.assertIsInstance(result.minor_axis, Index) - tm.assert_sp_panel_equal(panel, result) - - _test_roundtrip(self.panel) - - def test_dense_to_sparse(self): - wp = Panel.from_dict(self.data_dict) - dwp = wp.to_sparse() - tm.assertIsInstance(dwp['ItemA']['A'], SparseSeries) - - def test_to_dense(self): - dwp = self.panel.to_dense() - dwp2 = Panel.from_dict(self.data_dict) - tm.assert_panel_equal(dwp, dwp2) - - def test_to_frame(self): - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - - def _compare_with_dense(panel): - slp = panel.to_frame() - dlp = panel.to_dense().to_frame() - - self.assert_numpy_array_equal(slp.values, dlp.values) - self.assert_index_equal(slp.index, dlp.index, - check_names=False) - - _compare_with_dense(self.panel) - _compare_with_dense(self.panel.reindex(items=['ItemA'])) - - with tm.assert_produces_warning(FutureWarning): - zero_panel = SparsePanel(self.data_dict, default_fill_value=0) - self.assertRaises(Exception, zero_panel.to_frame) - - self.assertRaises(Exception, self.panel.to_frame, - filter_observations=False) - - def test_long_to_wide_sparse(self): - pass - - def 
test_values(self): - pass - - def test_setitem(self): - self.panel['ItemE'] = self.panel['ItemC'] - self.panel['ItemF'] = self.panel['ItemC'].to_dense() - - tm.assert_sp_frame_equal(self.panel['ItemE'], self.panel['ItemC']) - tm.assert_sp_frame_equal(self.panel['ItemF'], self.panel['ItemC']) - - expected = pd.Index(['ItemA', 'ItemB', 'ItemC', - 'ItemD', 'ItemE', 'ItemF']) - tm.assert_index_equal(self.panel.items, expected) - - self.assertRaises(Exception, self.panel.__setitem__, 'item6', 1) - - def test_set_value(self): - def _check_loc(item, major, minor, val=1.5): - res = self.panel.set_value(item, major, minor, val) - self.assertIsNot(res, self.panel) - self.assertEqual(res.get_value(item, major, minor), val) - - _check_loc('ItemA', self.panel.major_axis[4], self.panel.minor_axis[3]) - _check_loc('ItemF', self.panel.major_axis[4], self.panel.minor_axis[3]) - _check_loc('ItemF', 'foo', self.panel.minor_axis[3]) - _check_loc('ItemE', 'foo', 'bar') - - def test_delitem_pop(self): - del self.panel['ItemB'] - tm.assert_index_equal(self.panel.items, - pd.Index(['ItemA', 'ItemC', 'ItemD'])) - crackle = self.panel['ItemC'] - pop = self.panel.pop('ItemC') - self.assertIs(pop, crackle) - tm.assert_almost_equal(self.panel.items, pd.Index(['ItemA', 'ItemD'])) - - self.assertRaises(KeyError, self.panel.__delitem__, 'ItemC') - - def test_copy(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - cop = self.panel.copy() - tm.assert_sp_panel_equal(cop, self.panel) - - def test_reindex(self): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - - def _compare_with_dense(swp, items, major, minor): - swp_re = swp.reindex(items=items, major=major, minor=minor) - dwp_re = swp.to_dense().reindex(items=items, major=major, - minor=minor) - tm.assert_panel_equal(swp_re.to_dense(), dwp_re) - - _compare_with_dense(self.panel, self.panel.items[:2], - self.panel.major_axis[::2], - self.panel.minor_axis[::2]) - _compare_with_dense(self.panel, None, self.panel.major_axis[::2], - self.panel.minor_axis[::2]) - - self.assertRaises(ValueError, self.panel.reindex) - - # TODO: do something about this later... - self.assertRaises(Exception, self.panel.reindex, - items=['item0', 'ItemA', 'ItemB']) - - # test copying - cp = self.panel.reindex(self.panel.major_axis, copy=True) - cp['ItemA']['E'] = cp['ItemA']['A'] - self.assertNotIn('E', self.panel['ItemA']) - - def test_operators(self): - def _check_ops(panel): - def _dense_comp(op): - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - dense = panel.to_dense() - sparse_result = op(panel) - dense_result = op(dense) - tm.assert_panel_equal(sparse_result.to_dense(), - dense_result) - - def _mixed_comp(op): - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - result = op(panel, panel.to_dense()) - expected = op(panel.to_dense(), panel.to_dense()) - tm.assert_panel_equal(result, expected) - - op1 = lambda x: x + 2 - - _dense_comp(op1) - op2 = lambda x: x.add(x.reindex(major=x.major_axis[::2])) - _dense_comp(op2) - op3 = lambda x: x.subtract(x.mean(0), axis=0) - _dense_comp(op3) - op4 = lambda x: x.subtract(x.mean(1), axis=1) - _dense_comp(op4) - op5 = lambda x: x.subtract(x.mean(2), axis=2) - _dense_comp(op5) - - _mixed_comp(Panel.multiply) - _mixed_comp(Panel.subtract) - - # TODO: this case not yet supported! 
- # op6 = lambda x: x.add(x.to_frame()) - # _dense_comp(op6) - - _check_ops(self.panel) - - def test_major_xs(self): - def _dense_comp(sparse): - dense = sparse.to_dense() - - for idx in sparse.major_axis: - dslice = dense.major_xs(idx) - sslice = sparse.major_xs(idx) - tm.assert_frame_equal(dslice, sslice) - - _dense_comp(self.panel) - - def test_minor_xs(self): - def _dense_comp(sparse): - dense = sparse.to_dense() - - for idx in sparse.minor_axis: - dslice = dense.minor_xs(idx) - sslice = sparse.minor_xs(idx).to_dense() - tm.assert_frame_equal(dslice, sslice) - - _dense_comp(self.panel) - - -if __name__ == '__main__': - import nose # noqa - nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], - exit=False) diff --git a/pandas/sparse/tests/test_series.py b/pandas/sparse/tests/test_series.py index 27112319ea915..de8c63df9c9e6 100644 --- a/pandas/sparse/tests/test_series.py +++ b/pandas/sparse/tests/test_series.py @@ -7,9 +7,8 @@ import pandas as pd from pandas import Series, DataFrame, bdate_range -from pandas.core.datetools import BDay -import pandas.core.datetools as datetools from pandas.core.common import isnull +from pandas.tseries.offsets import BDay import pandas.util.testing as tm from pandas.compat import range from pandas import compat @@ -98,10 +97,14 @@ def test_constructor_dtype(self): self.assertEqual(arr.dtype, np.float64) self.assertEqual(arr.fill_value, 0) - arr = SparseSeries([0, 1, 2, 4], dtype=np.int64) + arr = SparseSeries([0, 1, 2, 4], dtype=np.int64, fill_value=np.nan) self.assertEqual(arr.dtype, np.int64) self.assertTrue(np.isnan(arr.fill_value)) + arr = SparseSeries([0, 1, 2, 4], dtype=np.int64) + self.assertEqual(arr.dtype, np.int64) + self.assertEqual(arr.fill_value, 0) + arr = SparseSeries([0, 1, 2, 4], fill_value=0, dtype=np.int64) self.assertEqual(arr.dtype, np.int64) self.assertEqual(arr.fill_value, 0) @@ -136,6 +139,15 @@ def test_construct_DataFrame_with_sp_series(self): result = df.ftypes tm.assert_series_equal(expected, result) + def test_constructor_preserve_attr(self): + arr = pd.SparseArray([1, 0, 3, 0], dtype=np.int64, fill_value=0) + self.assertEqual(arr.dtype, np.int64) + self.assertEqual(arr.fill_value, 0) + + s = pd.SparseSeries(arr, name='x') + self.assertEqual(s.dtype, np.int64) + self.assertEqual(s.fill_value, 0) + def test_series_density(self): # GH2803 ts = Series(np.random.randn(10)) @@ -345,7 +357,19 @@ def test_shape(self): self.assertEqual(self.ziseries2.shape, (15, )) def test_astype(self): - self.assertRaises(Exception, self.bseries.astype, np.int64) + with tm.assertRaises(ValueError): + self.bseries.astype(np.int64) + + def test_astype_all(self): + orig = pd.Series(np.array([1, 2, 3])) + s = SparseSeries(orig) + + types = [np.float64, np.float32, np.int64, + np.int32, np.int16, np.int8] + for typ in types: + res = s.astype(typ) + self.assertEqual(res.dtype, typ) + tm.assert_series_equal(res.to_dense(), orig.astype(typ)) def test_kind(self): self.assertEqual(self.bseries.kind, 'block') @@ -503,6 +527,7 @@ def test_setslice(self): name=self.bseries.name)) def test_operators(self): + def _check_op(a, b, op): sp_result = op(a, b) adense = a.to_dense() if isinstance(a, SparseSeries) else a @@ -580,6 +605,21 @@ def test_abs(self): tm.assert_sp_series_equal(result, expected) self.assertEqual(result.name, 'x') + s = SparseSeries([1, -2, 2, -3], fill_value=-2, name='x') + expected = SparseSeries([1, 2, 3], sparse_index=s.sp_index, + fill_value=2, name='x') + result = s.abs() + tm.assert_sp_series_equal(result, expected) + 
self.assertEqual(result.name, 'x') + + result = abs(s) + tm.assert_sp_series_equal(result, expected) + self.assertEqual(result.name, 'x') + + result = np.abs(s) + tm.assert_sp_series_equal(result, expected) + self.assertEqual(result.name, 'x') + def test_reindex(self): def _compare_with_series(sps, new_index): spsre = sps.reindex(new_index) @@ -741,7 +781,8 @@ def _check_matches(indices, expected): data = {} for i, idx in enumerate(indices): data[i] = SparseSeries(idx.to_int_index().indices, - sparse_index=idx) + sparse_index=idx, fill_value=np.nan) + # homogenized is only valid with NaN fill values homogenized = spf.homogenize(data) for k, v in compat.iteritems(homogenized): @@ -772,7 +813,8 @@ def test_fill_value_corner(self): cop2 = self.zbseries.copy() cop2.fill_value = 1 result = cop2 / cop - self.assertTrue(np.isnan(result.fill_value)) + # 1 / 0 is inf + self.assertTrue(np.isinf(result.fill_value)) def test_fill_value_when_combine_const(self): # GH12723 @@ -800,7 +842,7 @@ def test_shift(self): f = lambda s: s.shift(2, freq='B') _dense_series_compare(series, f) - f = lambda s: s.shift(2, freq=datetools.bday) + f = lambda s: s.shift(2, freq=BDay()) _dense_series_compare(series, f) def test_shift_nan(self): @@ -840,9 +882,14 @@ def test_shift_nan(self): def test_shift_dtype(self): # GH 12908 orig = pd.Series([1, 2, 3, 4], dtype=np.int64) - sparse = orig.to_sparse() + sparse = orig.to_sparse() tm.assert_sp_series_equal(sparse.shift(0), orig.shift(0).to_sparse()) + + sparse = orig.to_sparse(fill_value=np.nan) + tm.assert_sp_series_equal(sparse.shift(0), + orig.shift(0).to_sparse(fill_value=np.nan)) + # shift(1) or more span changes dtype to float64 tm.assert_sp_series_equal(sparse.shift(1), orig.shift(1).to_sparse()) tm.assert_sp_series_equal(sparse.shift(2), orig.shift(2).to_sparse()) tm.assert_sp_series_equal(sparse.shift(3), orig.shift(3).to_sparse()) @@ -855,25 +902,27 @@ def test_shift_dtype(self): def test_shift_dtype_fill_value(self): # GH 12908 orig = pd.Series([1, 0, 0, 4], dtype=np.int64) - sparse = orig.to_sparse(fill_value=0) - tm.assert_sp_series_equal(sparse.shift(0), - orig.shift(0).to_sparse(fill_value=0)) - tm.assert_sp_series_equal(sparse.shift(1), - orig.shift(1).to_sparse(fill_value=0)) - tm.assert_sp_series_equal(sparse.shift(2), - orig.shift(2).to_sparse(fill_value=0)) - tm.assert_sp_series_equal(sparse.shift(3), - orig.shift(3).to_sparse(fill_value=0)) - - tm.assert_sp_series_equal(sparse.shift(-1), - orig.shift(-1).to_sparse(fill_value=0)) - tm.assert_sp_series_equal(sparse.shift(-2), - orig.shift(-2).to_sparse(fill_value=0)) - tm.assert_sp_series_equal(sparse.shift(-3), - orig.shift(-3).to_sparse(fill_value=0)) - tm.assert_sp_series_equal(sparse.shift(-4), - orig.shift(-4).to_sparse(fill_value=0)) + for v in [0, 1, np.nan]: + sparse = orig.to_sparse(fill_value=v) + + tm.assert_sp_series_equal(sparse.shift(0), + orig.shift(0).to_sparse(fill_value=v)) + tm.assert_sp_series_equal(sparse.shift(1), + orig.shift(1).to_sparse(fill_value=v)) + tm.assert_sp_series_equal(sparse.shift(2), + orig.shift(2).to_sparse(fill_value=v)) + tm.assert_sp_series_equal(sparse.shift(3), + orig.shift(3).to_sparse(fill_value=v)) + + tm.assert_sp_series_equal(sparse.shift(-1), + orig.shift(-1).to_sparse(fill_value=v)) + tm.assert_sp_series_equal(sparse.shift(-2), + orig.shift(-2).to_sparse(fill_value=v)) + tm.assert_sp_series_equal(sparse.shift(-3), + orig.shift(-3).to_sparse(fill_value=v)) + tm.assert_sp_series_equal(sparse.shift(-4), + orig.shift(-4).to_sparse(fill_value=v)) def 
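The shift tests above compare the sparse result with the dense one for each fill value; as the added comment notes, shifting by one or more positions introduces missing values, so the underlying int64 data is upcast to float64. A small illustration with plain pandas, as a sanity check of that behaviour:

    import numpy as np
    import pandas as pd

    orig = pd.Series([1, 2, 3, 4], dtype=np.int64)
    # shift(0) is a plain copy and keeps int64; shift(1) inserts NaN at the
    # start, which forces an upcast of the dense values to float64.
    assert orig.shift(0).dtype == np.int64
    assert orig.shift(1).dtype == np.float64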
test_combine_first(self): s = self.bseries @@ -1221,6 +1270,40 @@ def test_value_counts_int(self): tm.assert_series_equal(sparse.value_counts(dropna=False), dense.value_counts(dropna=False)) + def test_isnull(self): + # GH 8276 + s = pd.SparseSeries([np.nan, np.nan, 1, 2, np.nan], name='xxx') + + res = s.isnull() + exp = pd.SparseSeries([True, True, False, False, True], name='xxx', + fill_value=True) + tm.assert_sp_series_equal(res, exp) + + # if fill_value is not nan, True can be included in sp_values + s = pd.SparseSeries([np.nan, 0., 1., 2., 0.], name='xxx', + fill_value=0.) + res = s.isnull() + tm.assertIsInstance(res, pd.SparseSeries) + exp = pd.Series([True, False, False, False, False], name='xxx') + tm.assert_series_equal(res.to_dense(), exp) + + def test_isnotnull(self): + # GH 8276 + s = pd.SparseSeries([np.nan, np.nan, 1, 2, np.nan], name='xxx') + + res = s.isnotnull() + exp = pd.SparseSeries([False, False, True, True, False], name='xxx', + fill_value=False) + tm.assert_sp_series_equal(res, exp) + + # if fill_value is not nan, True can be included in sp_values + s = pd.SparseSeries([np.nan, 0., 1., 2., 0.], name='xxx', + fill_value=0.) + res = s.isnotnull() + tm.assertIsInstance(res, pd.SparseSeries) + exp = pd.Series([False, True, True, True, True], name='xxx') + tm.assert_series_equal(res.to_dense(), exp) + def _dense_series_compare(s, f): result = f(s) @@ -1230,6 +1313,7 @@ def _dense_series_compare(s, f): class TestSparseSeriesAnalytics(tm.TestCase): + def setUp(self): arr, index = _test_data1() self.bseries = SparseSeries(arr, index=index, kind='block', @@ -1279,6 +1363,7 @@ def test_numpy_func_call(self): for series in ('bseries', 'zbseries'): getattr(np, func)(getattr(self, series)) + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/src/algos_common_helper.pxi b/pandas/src/algos_common_helper.pxi new file mode 100644 index 0000000000000..b89a80a73e2dd --- /dev/null +++ b/pandas/src/algos_common_helper.pxi @@ -0,0 +1,2927 @@ +""" +Template for each `dtype` helper function using 1-d template + +# 1-d template +- map_indices +- pad +- pad_1d +- pad_2d +- backfill +- backfill_1d +- backfill_2d +- is_monotonic +- groupby +- arrmap + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +#---------------------------------------------------------------------- +# 1-d template +#---------------------------------------------------------------------- + + +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef map_indices_float64(ndarray[float64_t] index): + """ + Produce a dict mapping the values of the input array to their respective + locations. + + Example: + array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} + + Better to do this with Cython because of the enormous speed boost. 
+ """ + cdef Py_ssize_t i, length + cdef dict result = {} + + length = len(index) + + for i in range(length): + result[index[i]] = i + + return result + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_float64(ndarray[float64_t] old, ndarray[float64_t] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef float64_t cur, next + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: + return indexer + + i = j = 0 + + cur = old[0] + + while j <= nright - 1 and new[j] < cur: + j += 1 + + while True: + if j == nright: + break + + if i == nleft - 1: + while j < nright: + if new[j] == cur: + indexer[j] = i + elif new[j] > cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + break + + next = old[i + 1] + + while j < nright and cur <= new[j] < next: + if new[j] == cur: + indexer[j] = i + elif fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + + fill_count = 0 + i += 1 + cur = next + + return indexer + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_inplace_float64(ndarray[float64_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef float64_t val + cdef int lim, fill_count = 0 + + N = len(values) + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[0] + for i in range(N): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_2d_inplace_float64(ndarray[float64_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef float64_t val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, 0] + for i in range(N): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] + +""" +Backfilling logic for generating fill vector + +Diagram of what's going on + +Old New Fill vector Mask + . 0 1 + . 0 1 + . 0 1 +A A 0 1 + . 1 1 + . 1 1 + . 1 1 + . 1 1 + . 1 1 +B B 1 1 + . 2 1 + . 2 1 + . 2 1 +C C 2 1 + . 0 + . 
0 +D +""" + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_float64(ndarray[float64_t] old, ndarray[float64_t] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef float64_t cur, prev + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: + return indexer + + i = nleft - 1 + j = nright - 1 + + cur = old[nleft - 1] + + while j >= 0 and new[j] > cur: + j -= 1 + + while True: + if j < 0: + break + + if i == 0: + while j >= 0: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + break + + prev = old[i - 1] + + while j >= 0 and prev < new[j] <= cur: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + + fill_count = 0 + i -= 1 + cur = prev + + return indexer + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_inplace_float64(ndarray[float64_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef float64_t val + cdef int lim, fill_count = 0 + + N = len(values) + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[N - 1] + for i in range(N - 1, -1, -1): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_2d_inplace_float64(ndarray[float64_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef float64_t val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, N - 1] + for i in range(N - 1, -1, -1): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def is_monotonic_float64(ndarray[float64_t] arr, bint timelike): + """ + Returns + ------- + is_monotonic_inc, is_monotonic_dec + """ + cdef: + Py_ssize_t i, n + float64_t prev, cur + bint is_monotonic_inc = 1 + bint is_monotonic_dec = 1 + + n = len(arr) + + if n == 1: + if arr[0] != arr[0] or (timelike and arr[0] == iNaT): + # single value is NaN + return False, False + else: + return True, True + elif n < 2: + return True, True + + if timelike and arr[0] == iNaT: + return False, False + + with nogil: + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if timelike and cur == iNaT: + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + if cur < prev: + is_monotonic_inc = 0 + elif cur > prev: + is_monotonic_dec = 0 + elif cur == prev: + pass # is_unique = 0 + else: + # cur or prev is NaN + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + if not is_monotonic_inc and not is_monotonic_dec: + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + prev = cur + return is_monotonic_inc, is_monotonic_dec + + +@cython.wraparound(False) 
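The pad_* and backfill_* helpers above build the fill vector sketched in the diagram: for every position in new they record which position of old supplies the value, with -1 meaning there is nothing to fill from, and limit capping how far a value may be carried. A simplified pure-Python sketch of the forward-fill (pad) case, ignoring some limit corner cases, with a worked example:

    import numpy as np

    def pad_indexer(old, new, limit=None):
        # For each value in `new`, index of the last `old` value <= it
        # (-1 if none), carrying a value forward at most `limit` positions.
        lim = len(new) if limit is None else limit
        indexer = np.full(len(new), -1, dtype=np.int64)
        i, fill_count = 0, 0
        for j, val in enumerate(new):
            while i + 1 < len(old) and old[i + 1] <= val:
                i += 1
                fill_count = 0
            if len(old) == 0 or old[i] > val:
                continue                   # before the first known value
            if old[i] == val:
                indexer[j] = i             # exact match
            elif fill_count < lim:
                indexer[j] = i             # carried forward
                fill_count += 1
        return indexer

    old = np.array([1.0, 3.0])
    new = np.array([0.0, 1.0, 2.0, 3.0, 4.0])
    assert pad_indexer(old, new).tolist() == [-1, 0, 0, 1, 1]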
+@cython.boundscheck(False) +def groupby_float64(ndarray[float64_t] index, ndarray labels): + cdef dict result = {} + cdef Py_ssize_t i, length + cdef list members + cdef object idx, key + + length = len(index) + + if not length == len(labels): + raise AssertionError("len(index) != len(labels)") + + for i in range(length): + key = util.get_value_1d(labels, i) + + if is_null_datetimelike(key): + continue + + idx = index[i] + if key in result: + members = result[key] + members.append(idx) + else: + result[key] = [idx] + + return result + + +@cython.wraparound(False) +@cython.boundscheck(False) +def arrmap_float64(ndarray[float64_t] index, object func): + cdef Py_ssize_t length = index.shape[0] + cdef Py_ssize_t i = 0 + + cdef ndarray[object] result = np.empty(length, dtype=np.object_) + + from pandas.lib import maybe_convert_objects + + for i in range(length): + result[i] = func(index[i]) + + return maybe_convert_objects(result) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef map_indices_float32(ndarray[float32_t] index): + """ + Produce a dict mapping the values of the input array to their respective + locations. + + Example: + array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} + + Better to do this with Cython because of the enormous speed boost. + """ + cdef Py_ssize_t i, length + cdef dict result = {} + + length = len(index) + + for i in range(length): + result[index[i]] = i + + return result + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_float32(ndarray[float32_t] old, ndarray[float32_t] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef float32_t cur, next + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: + return indexer + + i = j = 0 + + cur = old[0] + + while j <= nright - 1 and new[j] < cur: + j += 1 + + while True: + if j == nright: + break + + if i == nleft - 1: + while j < nright: + if new[j] == cur: + indexer[j] = i + elif new[j] > cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + break + + next = old[i + 1] + + while j < nright and cur <= new[j] < next: + if new[j] == cur: + indexer[j] = i + elif fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + + fill_count = 0 + i += 1 + cur = next + + return indexer + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_inplace_float32(ndarray[float32_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef float32_t val + cdef int lim, fill_count = 0 + + N = len(values) + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[0] + for i in range(N): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_2d_inplace_float32(ndarray[float32_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef float32_t val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') 
+ lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, 0] + for i in range(N): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] + +""" +Backfilling logic for generating fill vector + +Diagram of what's going on + +Old New Fill vector Mask + . 0 1 + . 0 1 + . 0 1 +A A 0 1 + . 1 1 + . 1 1 + . 1 1 + . 1 1 + . 1 1 +B B 1 1 + . 2 1 + . 2 1 + . 2 1 +C C 2 1 + . 0 + . 0 +D +""" + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_float32(ndarray[float32_t] old, ndarray[float32_t] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef float32_t cur, prev + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: + return indexer + + i = nleft - 1 + j = nright - 1 + + cur = old[nleft - 1] + + while j >= 0 and new[j] > cur: + j -= 1 + + while True: + if j < 0: + break + + if i == 0: + while j >= 0: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + break + + prev = old[i - 1] + + while j >= 0 and prev < new[j] <= cur: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + + fill_count = 0 + i -= 1 + cur = prev + + return indexer + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_inplace_float32(ndarray[float32_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef float32_t val + cdef int lim, fill_count = 0 + + N = len(values) + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[N - 1] + for i in range(N - 1, -1, -1): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_2d_inplace_float32(ndarray[float32_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef float32_t val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, N - 1] + for i in range(N - 1, -1, -1): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def is_monotonic_float32(ndarray[float32_t] arr, bint timelike): + """ + Returns + ------- + is_monotonic_inc, is_monotonic_dec + """ + cdef: + Py_ssize_t i, n + float32_t prev, cur + bint is_monotonic_inc = 1 + bint is_monotonic_dec = 1 + + n = len(arr) + + if n == 1: + if arr[0] != arr[0] or (timelike and arr[0] == iNaT): + # single value is NaN + return False, False + else: + return True, True + elif n < 2: + return True, True + + if timelike and arr[0] == iNaT: + return False, False + + with nogil: + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if timelike and cur == iNaT: + 
is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + if cur < prev: + is_monotonic_inc = 0 + elif cur > prev: + is_monotonic_dec = 0 + elif cur == prev: + pass # is_unique = 0 + else: + # cur or prev is NaN + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + if not is_monotonic_inc and not is_monotonic_dec: + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + prev = cur + return is_monotonic_inc, is_monotonic_dec + + +@cython.wraparound(False) +@cython.boundscheck(False) +def groupby_float32(ndarray[float32_t] index, ndarray labels): + cdef dict result = {} + cdef Py_ssize_t i, length + cdef list members + cdef object idx, key + + length = len(index) + + if not length == len(labels): + raise AssertionError("len(index) != len(labels)") + + for i in range(length): + key = util.get_value_1d(labels, i) + + if is_null_datetimelike(key): + continue + + idx = index[i] + if key in result: + members = result[key] + members.append(idx) + else: + result[key] = [idx] + + return result + + +@cython.wraparound(False) +@cython.boundscheck(False) +def arrmap_float32(ndarray[float32_t] index, object func): + cdef Py_ssize_t length = index.shape[0] + cdef Py_ssize_t i = 0 + + cdef ndarray[object] result = np.empty(length, dtype=np.object_) + + from pandas.lib import maybe_convert_objects + + for i in range(length): + result[i] = func(index[i]) + + return maybe_convert_objects(result) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef map_indices_object(ndarray[object] index): + """ + Produce a dict mapping the values of the input array to their respective + locations. + + Example: + array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} + + Better to do this with Cython because of the enormous speed boost. + """ + cdef Py_ssize_t i, length + cdef dict result = {} + + length = len(index) + + for i in range(length): + result[index[i]] = i + + return result + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_object(ndarray[object] old, ndarray[object] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef object cur, next + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: + return indexer + + i = j = 0 + + cur = old[0] + + while j <= nright - 1 and new[j] < cur: + j += 1 + + while True: + if j == nright: + break + + if i == nleft - 1: + while j < nright: + if new[j] == cur: + indexer[j] = i + elif new[j] > cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + break + + next = old[i + 1] + + while j < nright and cur <= new[j] < next: + if new[j] == cur: + indexer[j] = i + elif fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + + fill_count = 0 + i += 1 + cur = next + + return indexer + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_inplace_object(ndarray[object] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef object val + cdef int lim, fill_count = 0 + + N = len(values) + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[0] + for i in range(N): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + 
val = values[i] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_2d_inplace_object(ndarray[object, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef object val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, 0] + for i in range(N): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] + +""" +Backfilling logic for generating fill vector + +Diagram of what's going on + +Old New Fill vector Mask + . 0 1 + . 0 1 + . 0 1 +A A 0 1 + . 1 1 + . 1 1 + . 1 1 + . 1 1 + . 1 1 +B B 1 1 + . 2 1 + . 2 1 + . 2 1 +C C 2 1 + . 0 + . 0 +D +""" + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_object(ndarray[object] old, ndarray[object] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef object cur, prev + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: + return indexer + + i = nleft - 1 + j = nright - 1 + + cur = old[nleft - 1] + + while j >= 0 and new[j] > cur: + j -= 1 + + while True: + if j < 0: + break + + if i == 0: + while j >= 0: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + break + + prev = old[i - 1] + + while j >= 0 and prev < new[j] <= cur: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + + fill_count = 0 + i -= 1 + cur = prev + + return indexer + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_inplace_object(ndarray[object] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef object val + cdef int lim, fill_count = 0 + + N = len(values) + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[N - 1] + for i in range(N - 1, -1, -1): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_2d_inplace_object(ndarray[object, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef object val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, N - 1] + for i in range(N - 1, -1, -1): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def is_monotonic_object(ndarray[object] arr, bint timelike): + """ + Returns + ------- + is_monotonic_inc, is_monotonic_dec + """ + cdef: + Py_ssize_t i, n + object prev, cur + bint is_monotonic_inc 
= 1 + bint is_monotonic_dec = 1 + + n = len(arr) + + if n == 1: + if arr[0] != arr[0] or (timelike and arr[0] == iNaT): + # single value is NaN + return False, False + else: + return True, True + elif n < 2: + return True, True + + if timelike and arr[0] == iNaT: + return False, False + + + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if timelike and cur == iNaT: + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + if cur < prev: + is_monotonic_inc = 0 + elif cur > prev: + is_monotonic_dec = 0 + elif cur == prev: + pass # is_unique = 0 + else: + # cur or prev is NaN + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + if not is_monotonic_inc and not is_monotonic_dec: + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + prev = cur + return is_monotonic_inc, is_monotonic_dec + + +@cython.wraparound(False) +@cython.boundscheck(False) +def groupby_object(ndarray[object] index, ndarray labels): + cdef dict result = {} + cdef Py_ssize_t i, length + cdef list members + cdef object idx, key + + length = len(index) + + if not length == len(labels): + raise AssertionError("len(index) != len(labels)") + + for i in range(length): + key = util.get_value_1d(labels, i) + + if is_null_datetimelike(key): + continue + + idx = index[i] + if key in result: + members = result[key] + members.append(idx) + else: + result[key] = [idx] + + return result + + +@cython.wraparound(False) +@cython.boundscheck(False) +def arrmap_object(ndarray[object] index, object func): + cdef Py_ssize_t length = index.shape[0] + cdef Py_ssize_t i = 0 + + cdef ndarray[object] result = np.empty(length, dtype=np.object_) + + from pandas.lib import maybe_convert_objects + + for i in range(length): + result[i] = func(index[i]) + + return maybe_convert_objects(result) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef map_indices_int32(ndarray[int32_t] index): + """ + Produce a dict mapping the values of the input array to their respective + locations. + + Example: + array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} + + Better to do this with Cython because of the enormous speed boost. 
+ """ + cdef Py_ssize_t i, length + cdef dict result = {} + + length = len(index) + + for i in range(length): + result[index[i]] = i + + return result + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_int32(ndarray[int32_t] old, ndarray[int32_t] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef int32_t cur, next + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: + return indexer + + i = j = 0 + + cur = old[0] + + while j <= nright - 1 and new[j] < cur: + j += 1 + + while True: + if j == nright: + break + + if i == nleft - 1: + while j < nright: + if new[j] == cur: + indexer[j] = i + elif new[j] > cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + break + + next = old[i + 1] + + while j < nright and cur <= new[j] < next: + if new[j] == cur: + indexer[j] = i + elif fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + + fill_count = 0 + i += 1 + cur = next + + return indexer + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_inplace_int32(ndarray[int32_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef int32_t val + cdef int lim, fill_count = 0 + + N = len(values) + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[0] + for i in range(N): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_2d_inplace_int32(ndarray[int32_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef int32_t val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, 0] + for i in range(N): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] + +""" +Backfilling logic for generating fill vector + +Diagram of what's going on + +Old New Fill vector Mask + . 0 1 + . 0 1 + . 0 1 +A A 0 1 + . 1 1 + . 1 1 + . 1 1 + . 1 1 + . 1 1 +B B 1 1 + . 2 1 + . 2 1 + . 2 1 +C C 2 1 + . 0 + . 
0 +D +""" + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_int32(ndarray[int32_t] old, ndarray[int32_t] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef int32_t cur, prev + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: + return indexer + + i = nleft - 1 + j = nright - 1 + + cur = old[nleft - 1] + + while j >= 0 and new[j] > cur: + j -= 1 + + while True: + if j < 0: + break + + if i == 0: + while j >= 0: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + break + + prev = old[i - 1] + + while j >= 0 and prev < new[j] <= cur: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + + fill_count = 0 + i -= 1 + cur = prev + + return indexer + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_inplace_int32(ndarray[int32_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef int32_t val + cdef int lim, fill_count = 0 + + N = len(values) + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[N - 1] + for i in range(N - 1, -1, -1): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_2d_inplace_int32(ndarray[int32_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef int32_t val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, N - 1] + for i in range(N - 1, -1, -1): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def is_monotonic_int32(ndarray[int32_t] arr, bint timelike): + """ + Returns + ------- + is_monotonic_inc, is_monotonic_dec + """ + cdef: + Py_ssize_t i, n + int32_t prev, cur + bint is_monotonic_inc = 1 + bint is_monotonic_dec = 1 + + n = len(arr) + + if n == 1: + if arr[0] != arr[0] or (timelike and arr[0] == iNaT): + # single value is NaN + return False, False + else: + return True, True + elif n < 2: + return True, True + + if timelike and arr[0] == iNaT: + return False, False + + with nogil: + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if timelike and cur == iNaT: + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + if cur < prev: + is_monotonic_inc = 0 + elif cur > prev: + is_monotonic_dec = 0 + elif cur == prev: + pass # is_unique = 0 + else: + # cur or prev is NaN + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + if not is_monotonic_inc and not is_monotonic_dec: + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + prev = cur + return is_monotonic_inc, is_monotonic_dec + + +@cython.wraparound(False) +@cython.boundscheck(False) +def 
groupby_int32(ndarray[int32_t] index, ndarray labels): + cdef dict result = {} + cdef Py_ssize_t i, length + cdef list members + cdef object idx, key + + length = len(index) + + if not length == len(labels): + raise AssertionError("len(index) != len(labels)") + + for i in range(length): + key = util.get_value_1d(labels, i) + + if is_null_datetimelike(key): + continue + + idx = index[i] + if key in result: + members = result[key] + members.append(idx) + else: + result[key] = [idx] + + return result + + +@cython.wraparound(False) +@cython.boundscheck(False) +def arrmap_int32(ndarray[int32_t] index, object func): + cdef Py_ssize_t length = index.shape[0] + cdef Py_ssize_t i = 0 + + cdef ndarray[object] result = np.empty(length, dtype=np.object_) + + from pandas.lib import maybe_convert_objects + + for i in range(length): + result[i] = func(index[i]) + + return maybe_convert_objects(result) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef map_indices_int64(ndarray[int64_t] index): + """ + Produce a dict mapping the values of the input array to their respective + locations. + + Example: + array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} + + Better to do this with Cython because of the enormous speed boost. + """ + cdef Py_ssize_t i, length + cdef dict result = {} + + length = len(index) + + for i in range(length): + result[index[i]] = i + + return result + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_int64(ndarray[int64_t] old, ndarray[int64_t] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef int64_t cur, next + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: + return indexer + + i = j = 0 + + cur = old[0] + + while j <= nright - 1 and new[j] < cur: + j += 1 + + while True: + if j == nright: + break + + if i == nleft - 1: + while j < nright: + if new[j] == cur: + indexer[j] = i + elif new[j] > cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + break + + next = old[i + 1] + + while j < nright and cur <= new[j] < next: + if new[j] == cur: + indexer[j] = i + elif fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + + fill_count = 0 + i += 1 + cur = next + + return indexer + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_inplace_int64(ndarray[int64_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef int64_t val + cdef int lim, fill_count = 0 + + N = len(values) + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[0] + for i in range(N): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_2d_inplace_int64(ndarray[int64_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef int64_t val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = 
values[j, 0] + for i in range(N): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] + +""" +Backfilling logic for generating fill vector + +Diagram of what's going on + +Old New Fill vector Mask + . 0 1 + . 0 1 + . 0 1 +A A 0 1 + . 1 1 + . 1 1 + . 1 1 + . 1 1 + . 1 1 +B B 1 1 + . 2 1 + . 2 1 + . 2 1 +C C 2 1 + . 0 + . 0 +D +""" + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_int64(ndarray[int64_t] old, ndarray[int64_t] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef int64_t cur, prev + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: + return indexer + + i = nleft - 1 + j = nright - 1 + + cur = old[nleft - 1] + + while j >= 0 and new[j] > cur: + j -= 1 + + while True: + if j < 0: + break + + if i == 0: + while j >= 0: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + break + + prev = old[i - 1] + + while j >= 0 and prev < new[j] <= cur: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + + fill_count = 0 + i -= 1 + cur = prev + + return indexer + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_inplace_int64(ndarray[int64_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef int64_t val + cdef int lim, fill_count = 0 + + N = len(values) + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[N - 1] + for i in range(N - 1, -1, -1): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_2d_inplace_int64(ndarray[int64_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef int64_t val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, N - 1] + for i in range(N - 1, -1, -1): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def is_monotonic_int64(ndarray[int64_t] arr, bint timelike): + """ + Returns + ------- + is_monotonic_inc, is_monotonic_dec + """ + cdef: + Py_ssize_t i, n + int64_t prev, cur + bint is_monotonic_inc = 1 + bint is_monotonic_dec = 1 + + n = len(arr) + + if n == 1: + if arr[0] != arr[0] or (timelike and arr[0] == iNaT): + # single value is NaN + return False, False + else: + return True, True + elif n < 2: + return True, True + + if timelike and arr[0] == iNaT: + return False, False + + with nogil: + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if timelike and cur == iNaT: + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + if cur < prev: + is_monotonic_inc = 0 + 
elif cur > prev: + is_monotonic_dec = 0 + elif cur == prev: + pass # is_unique = 0 + else: + # cur or prev is NaN + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + if not is_monotonic_inc and not is_monotonic_dec: + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + prev = cur + return is_monotonic_inc, is_monotonic_dec + + +@cython.wraparound(False) +@cython.boundscheck(False) +def groupby_int64(ndarray[int64_t] index, ndarray labels): + cdef dict result = {} + cdef Py_ssize_t i, length + cdef list members + cdef object idx, key + + length = len(index) + + if not length == len(labels): + raise AssertionError("len(index) != len(labels)") + + for i in range(length): + key = util.get_value_1d(labels, i) + + if is_null_datetimelike(key): + continue + + idx = index[i] + if key in result: + members = result[key] + members.append(idx) + else: + result[key] = [idx] + + return result + + +@cython.wraparound(False) +@cython.boundscheck(False) +def arrmap_int64(ndarray[int64_t] index, object func): + cdef Py_ssize_t length = index.shape[0] + cdef Py_ssize_t i = 0 + + cdef ndarray[object] result = np.empty(length, dtype=np.object_) + + from pandas.lib import maybe_convert_objects + + for i in range(length): + result[i] = func(index[i]) + + return maybe_convert_objects(result) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef map_indices_bool(ndarray[uint8_t] index): + """ + Produce a dict mapping the values of the input array to their respective + locations. + + Example: + array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} + + Better to do this with Cython because of the enormous speed boost. + """ + cdef Py_ssize_t i, length + cdef dict result = {} + + length = len(index) + + for i in range(length): + result[index[i]] = i + + return result + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_bool(ndarray[uint8_t] old, ndarray[uint8_t] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef uint8_t cur, next + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: + return indexer + + i = j = 0 + + cur = old[0] + + while j <= nright - 1 and new[j] < cur: + j += 1 + + while True: + if j == nright: + break + + if i == nleft - 1: + while j < nright: + if new[j] == cur: + indexer[j] = i + elif new[j] > cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + break + + next = old[i + 1] + + while j < nright and cur <= new[j] < next: + if new[j] == cur: + indexer[j] = i + elif fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + + fill_count = 0 + i += 1 + cur = next + + return indexer + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_inplace_bool(ndarray[uint8_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef uint8_t val + cdef int lim, fill_count = 0 + + N = len(values) + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[0] + for i in range(N): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def 
pad_2d_inplace_bool(ndarray[uint8_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef uint8_t val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, 0] + for i in range(N): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] + +""" +Backfilling logic for generating fill vector + +Diagram of what's going on + +Old New Fill vector Mask + . 0 1 + . 0 1 + . 0 1 +A A 0 1 + . 1 1 + . 1 1 + . 1 1 + . 1 1 + . 1 1 +B B 1 1 + . 2 1 + . 2 1 + . 2 1 +C C 2 1 + . 0 + . 0 +D +""" + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_bool(ndarray[uint8_t] old, ndarray[uint8_t] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef uint8_t cur, prev + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: + return indexer + + i = nleft - 1 + j = nright - 1 + + cur = old[nleft - 1] + + while j >= 0 and new[j] > cur: + j -= 1 + + while True: + if j < 0: + break + + if i == 0: + while j >= 0: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + break + + prev = old[i - 1] + + while j >= 0 and prev < new[j] <= cur: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + + fill_count = 0 + i -= 1 + cur = prev + + return indexer + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_inplace_bool(ndarray[uint8_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef uint8_t val + cdef int lim, fill_count = 0 + + N = len(values) + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[N - 1] + for i in range(N - 1, -1, -1): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_2d_inplace_bool(ndarray[uint8_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef uint8_t val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, N - 1] + for i in range(N - 1, -1, -1): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def is_monotonic_bool(ndarray[uint8_t] arr, bint timelike): + """ + Returns + ------- + is_monotonic_inc, is_monotonic_dec + """ + cdef: + Py_ssize_t i, n + uint8_t prev, cur + bint is_monotonic_inc = 1 + bint is_monotonic_dec = 1 + + n = len(arr) + + if n == 1: + if arr[0] != 
arr[0] or (timelike and arr[0] == iNaT): + # single value is NaN + return False, False + else: + return True, True + elif n < 2: + return True, True + + if timelike and arr[0] == iNaT: + return False, False + + with nogil: + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if timelike and cur == iNaT: + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + if cur < prev: + is_monotonic_inc = 0 + elif cur > prev: + is_monotonic_dec = 0 + elif cur == prev: + pass # is_unique = 0 + else: + # cur or prev is NaN + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + if not is_monotonic_inc and not is_monotonic_dec: + is_monotonic_inc = 0 + is_monotonic_dec = 0 + break + prev = cur + return is_monotonic_inc, is_monotonic_dec + + +@cython.wraparound(False) +@cython.boundscheck(False) +def groupby_bool(ndarray[uint8_t] index, ndarray labels): + cdef dict result = {} + cdef Py_ssize_t i, length + cdef list members + cdef object idx, key + + length = len(index) + + if not length == len(labels): + raise AssertionError("len(index) != len(labels)") + + for i in range(length): + key = util.get_value_1d(labels, i) + + if is_null_datetimelike(key): + continue + + idx = index[i] + if key in result: + members = result[key] + members.append(idx) + else: + result[key] = [idx] + + return result + + +@cython.wraparound(False) +@cython.boundscheck(False) +def arrmap_bool(ndarray[uint8_t] index, object func): + cdef Py_ssize_t length = index.shape[0] + cdef Py_ssize_t i = 0 + + cdef ndarray[object] result = np.empty(length, dtype=np.object_) + + from pandas.lib import maybe_convert_objects + + for i in range(length): + result[i] = func(index[i]) + + return maybe_convert_objects(result) + +#---------------------------------------------------------------------- +# put template +#---------------------------------------------------------------------- + + +@cython.boundscheck(False) +@cython.wraparound(False) +def diff_2d_float64(ndarray[float64_t, ndim=2] arr, + ndarray[float64_t, ndim=2] out, + Py_ssize_t periods, int axis): + cdef: + Py_ssize_t i, j, sx, sy + + sx, sy = ( arr).shape + if arr.flags.f_contiguous: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for j in range(sy): + for i in range(start, stop): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for j in range(start, stop): + for i in range(sx): + out[i, j] = arr[i, j] - arr[i, j - periods] + else: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for i in range(start, stop): + for j in range(sy): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for i in range(sx): + for j in range(start, stop): + out[i, j] = arr[i, j] - arr[i, j - periods] + + +def put2d_float64_float64(ndarray[float64_t, ndim=2, cast=True] values, + ndarray[int64_t] indexer, Py_ssize_t loc, + ndarray[float64_t] out): + cdef: + Py_ssize_t i, j, k + + k = len(values) + for j from 0 <= j < k: + i = indexer[j] + out[i] = values[j, loc] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def diff_2d_float32(ndarray[float32_t, ndim=2] arr, + ndarray[float32_t, ndim=2] out, + Py_ssize_t periods, int axis): + cdef: + Py_ssize_t i, j, sx, sy + + sx, sy = ( arr).shape + if arr.flags.f_contiguous: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, 
sx + periods + for j in range(sy): + for i in range(start, stop): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for j in range(start, stop): + for i in range(sx): + out[i, j] = arr[i, j] - arr[i, j - periods] + else: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for i in range(start, stop): + for j in range(sy): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for i in range(sx): + for j in range(start, stop): + out[i, j] = arr[i, j] - arr[i, j - periods] + + +def put2d_float32_float32(ndarray[float32_t, ndim=2, cast=True] values, + ndarray[int64_t] indexer, Py_ssize_t loc, + ndarray[float32_t] out): + cdef: + Py_ssize_t i, j, k + + k = len(values) + for j from 0 <= j < k: + i = indexer[j] + out[i] = values[j, loc] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def diff_2d_int8(ndarray[int8_t, ndim=2] arr, + ndarray[float32_t, ndim=2] out, + Py_ssize_t periods, int axis): + cdef: + Py_ssize_t i, j, sx, sy + + sx, sy = ( arr).shape + if arr.flags.f_contiguous: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for j in range(sy): + for i in range(start, stop): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for j in range(start, stop): + for i in range(sx): + out[i, j] = arr[i, j] - arr[i, j - periods] + else: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for i in range(start, stop): + for j in range(sy): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for i in range(sx): + for j in range(start, stop): + out[i, j] = arr[i, j] - arr[i, j - periods] + + +def put2d_int8_float32(ndarray[int8_t, ndim=2, cast=True] values, + ndarray[int64_t] indexer, Py_ssize_t loc, + ndarray[float32_t] out): + cdef: + Py_ssize_t i, j, k + + k = len(values) + for j from 0 <= j < k: + i = indexer[j] + out[i] = values[j, loc] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def diff_2d_int16(ndarray[int16_t, ndim=2] arr, + ndarray[float32_t, ndim=2] out, + Py_ssize_t periods, int axis): + cdef: + Py_ssize_t i, j, sx, sy + + sx, sy = ( arr).shape + if arr.flags.f_contiguous: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for j in range(sy): + for i in range(start, stop): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for j in range(start, stop): + for i in range(sx): + out[i, j] = arr[i, j] - arr[i, j - periods] + else: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for i in range(start, stop): + for j in range(sy): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for i in range(sx): + for j in range(start, stop): + out[i, j] = arr[i, j] - arr[i, j - periods] + + +def put2d_int16_float32(ndarray[int16_t, ndim=2, cast=True] values, + ndarray[int64_t] indexer, Py_ssize_t loc, + ndarray[float32_t] out): + cdef: + Py_ssize_t i, j, k + + k = len(values) + 
for j from 0 <= j < k: + i = indexer[j] + out[i] = values[j, loc] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def diff_2d_int32(ndarray[int32_t, ndim=2] arr, + ndarray[float64_t, ndim=2] out, + Py_ssize_t periods, int axis): + cdef: + Py_ssize_t i, j, sx, sy + + sx, sy = ( arr).shape + if arr.flags.f_contiguous: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for j in range(sy): + for i in range(start, stop): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for j in range(start, stop): + for i in range(sx): + out[i, j] = arr[i, j] - arr[i, j - periods] + else: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for i in range(start, stop): + for j in range(sy): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for i in range(sx): + for j in range(start, stop): + out[i, j] = arr[i, j] - arr[i, j - periods] + + +def put2d_int32_float64(ndarray[int32_t, ndim=2, cast=True] values, + ndarray[int64_t] indexer, Py_ssize_t loc, + ndarray[float64_t] out): + cdef: + Py_ssize_t i, j, k + + k = len(values) + for j from 0 <= j < k: + i = indexer[j] + out[i] = values[j, loc] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def diff_2d_int64(ndarray[int64_t, ndim=2] arr, + ndarray[float64_t, ndim=2] out, + Py_ssize_t periods, int axis): + cdef: + Py_ssize_t i, j, sx, sy + + sx, sy = ( arr).shape + if arr.flags.f_contiguous: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for j in range(sy): + for i in range(start, stop): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for j in range(start, stop): + for i in range(sx): + out[i, j] = arr[i, j] - arr[i, j - periods] + else: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for i in range(start, stop): + for j in range(sy): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for i in range(sx): + for j in range(start, stop): + out[i, j] = arr[i, j] - arr[i, j - periods] + + +def put2d_int64_float64(ndarray[int64_t, ndim=2, cast=True] values, + ndarray[int64_t] indexer, Py_ssize_t loc, + ndarray[float64_t] out): + cdef: + Py_ssize_t i, j, k + + k = len(values) + for j from 0 <= j < k: + i = indexer[j] + out[i] = values[j, loc] + +#---------------------------------------------------------------------- +# ensure_dtype +#---------------------------------------------------------------------- + +cdef int PLATFORM_INT = ( np.arange(0, dtype=np.intp)).descr.type_num + +cpdef ensure_platform_int(object arr): + # GH3033, GH1392 + # platform int is the size of the int pointer, e.g. 
np.intp + if util.is_array(arr): + if ( arr).descr.type_num == PLATFORM_INT: + return arr + else: + return arr.astype(np.intp) + else: + return np.array(arr, dtype=np.intp) + +cpdef ensure_object(object arr): + if util.is_array(arr): + if ( arr).descr.type_num == NPY_OBJECT: + return arr + else: + return arr.astype(np.object_) + elif hasattr(arr, 'asobject'): + return arr.asobject + else: + return np.array(arr, dtype=np.object_) + +cpdef ensure_float64(object arr): + if util.is_array(arr): + if ( arr).descr.type_num == NPY_FLOAT64: + return arr + else: + return arr.astype(np.float64) + else: + return np.array(arr, dtype=np.float64) + +cpdef ensure_float32(object arr): + if util.is_array(arr): + if ( arr).descr.type_num == NPY_FLOAT32: + return arr + else: + return arr.astype(np.float32) + else: + return np.array(arr, dtype=np.float32) + +cpdef ensure_int8(object arr): + if util.is_array(arr): + if ( arr).descr.type_num == NPY_INT8: + return arr + else: + return arr.astype(np.int8) + else: + return np.array(arr, dtype=np.int8) + +cpdef ensure_int16(object arr): + if util.is_array(arr): + if ( arr).descr.type_num == NPY_INT16: + return arr + else: + return arr.astype(np.int16) + else: + return np.array(arr, dtype=np.int16) + +cpdef ensure_int32(object arr): + if util.is_array(arr): + if ( arr).descr.type_num == NPY_INT32: + return arr + else: + return arr.astype(np.int32) + else: + return np.array(arr, dtype=np.int32) + +cpdef ensure_int64(object arr): + if util.is_array(arr): + if ( arr).descr.type_num == NPY_INT64: + return arr + else: + return arr.astype(np.int64) + else: + return np.array(arr, dtype=np.int64) diff --git a/pandas/src/algos_common_helper.pxi.in b/pandas/src/algos_common_helper.pxi.in new file mode 100644 index 0000000000000..1451ffb054e5d --- /dev/null +++ b/pandas/src/algos_common_helper.pxi.in @@ -0,0 +1,605 @@ +""" +Template for each `dtype` helper function using 1-d template + +# 1-d template +- map_indices +- pad +- pad_1d +- pad_2d +- backfill +- backfill_1d +- backfill_2d +- is_monotonic +- groupby +- arrmap + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +#---------------------------------------------------------------------- +# 1-d template +#---------------------------------------------------------------------- + +{{py: + +# name, c_type, dtype, can_hold_na, nogil +dtypes = [('float64', 'float64_t', 'np.float64', True, True), + ('float32', 'float32_t', 'np.float32', True, True), + ('object', 'object', 'object', True, False), + ('int32', 'int32_t', 'np.int32', False, True), + ('int64', 'int64_t', 'np.int64', False, True), + ('bool', 'uint8_t', 'np.bool', False, True)] + +def get_dispatch(dtypes): + + for name, c_type, dtype, can_hold_na, nogil in dtypes: + + nogil_str = 'with nogil:' if nogil else '' + tab = ' ' if nogil else '' + yield name, c_type, dtype, can_hold_na, nogil_str, tab +}} + +{{for name, c_type, dtype, can_hold_na, nogil_str, tab + in get_dispatch(dtypes)}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef map_indices_{{name}}(ndarray[{{c_type}}] index): + """ + Produce a dict mapping the values of the input array to their respective + locations. + + Example: + array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} + + Better to do this with Cython because of the enormous speed boost. 
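+
+    For reference, an illustrative pure-Python equivalent (not part of this
+    module, and much slower than the typed Cython loop below) would be:
+
+        def map_indices(index):
+            # assumes hashable values; a duplicate value maps to its last
+            # position, matching the loop below
+            return {val: i for i, val in enumerate(index)}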
+ """ + cdef Py_ssize_t i, length + cdef dict result = {} + + length = len(index) + + for i in range(length): + result[index[i]] = i + + return result + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_{{name}}(ndarray[{{c_type}}] old, ndarray[{{c_type}}] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef {{c_type}} cur, next + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: + return indexer + + i = j = 0 + + cur = old[0] + + while j <= nright - 1 and new[j] < cur: + j += 1 + + while True: + if j == nright: + break + + if i == nleft - 1: + while j < nright: + if new[j] == cur: + indexer[j] = i + elif new[j] > cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + break + + next = old[i + 1] + + while j < nright and cur <= new[j] < next: + if new[j] == cur: + indexer[j] = i + elif fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + + fill_count = 0 + i += 1 + cur = next + + return indexer + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_inplace_{{name}}(ndarray[{{c_type}}] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef {{c_type}} val + cdef int lim, fill_count = 0 + + N = len(values) + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[0] + for i in range(N): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_2d_inplace_{{name}}(ndarray[{{c_type}}, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef {{c_type}} val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, 0] + for i in range(N): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] + +""" +Backfilling logic for generating fill vector + +Diagram of what's going on + +Old New Fill vector Mask + . 0 1 + . 0 1 + . 0 1 +A A 0 1 + . 1 1 + . 1 1 + . 1 1 + . 1 1 + . 1 1 +B B 1 1 + . 2 1 + . 2 1 + . 2 1 +C C 2 1 + . 0 + . 
0 +D +""" + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_{{name}}(ndarray[{{c_type}}] old, ndarray[{{c_type}}] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef {{c_type}} cur, prev + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: + return indexer + + i = nleft - 1 + j = nright - 1 + + cur = old[nleft - 1] + + while j >= 0 and new[j] > cur: + j -= 1 + + while True: + if j < 0: + break + + if i == 0: + while j >= 0: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + break + + prev = old[i - 1] + + while j >= 0 and prev < new[j] <= cur: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + + fill_count = 0 + i -= 1 + cur = prev + + return indexer + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_inplace_{{name}}(ndarray[{{c_type}}] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef {{c_type}} val + cdef int lim, fill_count = 0 + + N = len(values) + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[N - 1] + for i in range(N - 1, -1, -1): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_2d_inplace_{{name}}(ndarray[{{c_type}}, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef {{c_type}} val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + # GH 2778 + if N == 0: + return + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, N - 1] + for i in range(N - 1, -1, -1): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def is_monotonic_{{name}}(ndarray[{{c_type}}] arr, bint timelike): + """ + Returns + ------- + is_monotonic_inc, is_monotonic_dec + """ + cdef: + Py_ssize_t i, n + {{c_type}} prev, cur + bint is_monotonic_inc = 1 + bint is_monotonic_dec = 1 + + n = len(arr) + + if n == 1: + if arr[0] != arr[0] or (timelike and arr[0] == iNaT): + # single value is NaN + return False, False + else: + return True, True + elif n < 2: + return True, True + + if timelike and arr[0] == iNaT: + return False, False + + {{nogil_str}} + {{tab}}prev = arr[0] + {{tab}}for i in range(1, n): + {{tab}} cur = arr[i] + {{tab}} if timelike and cur == iNaT: + {{tab}} is_monotonic_inc = 0 + {{tab}} is_monotonic_dec = 0 + {{tab}} break + {{tab}} if cur < prev: + {{tab}} is_monotonic_inc = 0 + {{tab}} elif cur > prev: + {{tab}} is_monotonic_dec = 0 + {{tab}} elif cur == prev: + {{tab}} pass # is_unique = 0 + {{tab}} else: + {{tab}} # cur or prev is NaN + {{tab}} is_monotonic_inc = 0 + {{tab}} is_monotonic_dec = 0 + {{tab}} break + {{tab}} if not is_monotonic_inc and not 
is_monotonic_dec: + {{tab}} is_monotonic_inc = 0 + {{tab}} is_monotonic_dec = 0 + {{tab}} break + {{tab}} prev = cur + return is_monotonic_inc, is_monotonic_dec + + +@cython.wraparound(False) +@cython.boundscheck(False) +def groupby_{{name}}(ndarray[{{c_type}}] index, ndarray labels): + cdef dict result = {} + cdef Py_ssize_t i, length + cdef list members + cdef object idx, key + + length = len(index) + + if not length == len(labels): + raise AssertionError("len(index) != len(labels)") + + for i in range(length): + key = util.get_value_1d(labels, i) + + if is_null_datetimelike(key): + continue + + idx = index[i] + if key in result: + members = result[key] + members.append(idx) + else: + result[key] = [idx] + + return result + + +@cython.wraparound(False) +@cython.boundscheck(False) +def arrmap_{{name}}(ndarray[{{c_type}}] index, object func): + cdef Py_ssize_t length = index.shape[0] + cdef Py_ssize_t i = 0 + + cdef ndarray[object] result = np.empty(length, dtype=np.object_) + + from pandas.lib import maybe_convert_objects + + for i in range(length): + result[i] = func(index[i]) + + return maybe_convert_objects(result) + +{{endfor}} + +#---------------------------------------------------------------------- +# put template +#---------------------------------------------------------------------- + +{{py: + +# name, c_type, dest_type, dest_dtype +dtypes = [('float64', 'float64_t', 'float64_t', 'np.float64'), + ('float32', 'float32_t', 'float32_t', 'np.float32'), + ('int8', 'int8_t', 'float32_t', 'np.float32'), + ('int16', 'int16_t', 'float32_t', 'np.float32'), + ('int32', 'int32_t', 'float64_t', 'np.float64'), + ('int64', 'int64_t', 'float64_t', 'np.float64')] + +def get_dispatch(dtypes): + + for name, c_type, dest_type, dest_dtype, in dtypes: + + dest_type2 = dest_type + dest_type = dest_type.replace('_t', '') + + yield name, c_type, dest_type, dest_type2, dest_dtype + +}} + +{{for name, c_type, dest_type, dest_type2, dest_dtype + in get_dispatch(dtypes)}} + + +@cython.boundscheck(False) +@cython.wraparound(False) +def diff_2d_{{name}}(ndarray[{{c_type}}, ndim=2] arr, + ndarray[{{dest_type2}}, ndim=2] out, + Py_ssize_t periods, int axis): + cdef: + Py_ssize_t i, j, sx, sy + + sx, sy = ( arr).shape + if arr.flags.f_contiguous: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for j in range(sy): + for i in range(start, stop): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for j in range(start, stop): + for i in range(sx): + out[i, j] = arr[i, j] - arr[i, j - periods] + else: + if axis == 0: + if periods >= 0: + start, stop = periods, sx + else: + start, stop = 0, sx + periods + for i in range(start, stop): + for j in range(sy): + out[i, j] = arr[i, j] - arr[i - periods, j] + else: + if periods >= 0: + start, stop = periods, sy + else: + start, stop = 0, sy + periods + for i in range(sx): + for j in range(start, stop): + out[i, j] = arr[i, j] - arr[i, j - periods] + + +def put2d_{{name}}_{{dest_type}}(ndarray[{{c_type}}, ndim=2, cast=True] values, + ndarray[int64_t] indexer, Py_ssize_t loc, + ndarray[{{dest_type2}}] out): + cdef: + Py_ssize_t i, j, k + + k = len(values) + for j from 0 <= j < k: + i = indexer[j] + out[i] = values[j, loc] + +{{endfor}} + +#---------------------------------------------------------------------- +# ensure_dtype +#---------------------------------------------------------------------- + +cdef int PLATFORM_INT = ( 
np.arange(0, dtype=np.intp)).descr.type_num + +cpdef ensure_platform_int(object arr): + # GH3033, GH1392 + # platform int is the size of the int pointer, e.g. np.intp + if util.is_array(arr): + if ( arr).descr.type_num == PLATFORM_INT: + return arr + else: + return arr.astype(np.intp) + else: + return np.array(arr, dtype=np.intp) + +cpdef ensure_object(object arr): + if util.is_array(arr): + if ( arr).descr.type_num == NPY_OBJECT: + return arr + else: + return arr.astype(np.object_) + elif hasattr(arr, 'asobject'): + return arr.asobject + else: + return np.array(arr, dtype=np.object_) + +{{py: + +# name, c_type, dtype +dtypes = [('float64', 'FLOAT64', 'float64'), + ('float32', 'FLOAT32', 'float32'), + ('int8', 'INT8', 'int8'), + ('int16', 'INT16', 'int16'), + ('int32', 'INT32', 'int32'), + ('int64', 'INT64', 'int64'), + # ('platform_int', 'INT', 'int_'), + # ('object', 'OBJECT', 'object_'), +] + +def get_dispatch(dtypes): + + for name, c_type, dtype in dtypes: + yield name, c_type, dtype +}} + +{{for name, c_type, dtype in get_dispatch(dtypes)}} + +cpdef ensure_{{name}}(object arr): + if util.is_array(arr): + if ( arr).descr.type_num == NPY_{{c_type}}: + return arr + else: + return arr.astype(np.{{dtype}}) + else: + return np.array(arr, dtype=np.{{dtype}}) + +{{endfor}} diff --git a/pandas/src/algos_groupby_helper.pxi b/pandas/src/algos_groupby_helper.pxi new file mode 100644 index 0000000000000..013a03f719bbd --- /dev/null +++ b/pandas/src/algos_groupby_helper.pxi @@ -0,0 +1,1375 @@ +""" +Template for each `dtype` helper function using groupby + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +cdef extern from "numpy/npy_math.h": + double NAN "NPY_NAN" +_int64_max = np.iinfo(np.int64).max + +#---------------------------------------------------------------------- +# group_add, group_prod, group_var, group_mean, group_ohlc +#---------------------------------------------------------------------- + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_add_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + float64_t val, count + ndarray[float64_t, ndim=2] sumx, nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + + N, K = ( values).shape + + with nogil: + + if K > 1: + + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + sumx[lab, j] += val + + else: + + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[lab, 0] += 1 + sumx[lab, 0] += val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = sumx[i, j] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_prod_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + float64_t val, count + ndarray[float64_t, ndim=2] prodx, nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + prodx = 
np.ones_like(out) + + N, K = ( values).shape + + with nogil: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + prodx[lab, j] *= val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[lab, 0] += 1 + prodx[lab, 0] *= val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = prodx[i, j] + + +@cython.wraparound(False) +@cython.boundscheck(False) +@cython.cdivision(True) +def group_var_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + float64_t val, ct, oldmean + ndarray[float64_t, ndim=2] nobs, mean + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + mean = np.zeros_like(out) + + N, K = ( values).shape + + out[:, :] = 0.0 + + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + oldmean = mean[lab, j] + mean[lab, j] += (val - oldmean) / nobs[lab, j] + out[lab, j] += (val - mean[lab, j]) * (val - oldmean) + + for i in range(ncounts): + for j in range(K): + ct = nobs[i, j] + if ct < 2: + out[i, j] = NAN + else: + out[i, j] /= (ct - 1) +# add passing bin edges, instead of labels + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_mean_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + float64_t val, count + ndarray[float64_t, ndim=2] sumx, nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + + N, K = ( values).shape + + with nogil: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + # not nan + if val == val: + nobs[lab, j] += 1 + sumx[lab, j] += val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + # not nan + if val == val: + nobs[lab, 0] += 1 + sumx[lab, 0] += val + + for i in range(ncounts): + for j in range(K): + count = nobs[i, j] + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = sumx[i, j] / count + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_ohlc_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab + float64_t val, count + Py_ssize_t ngroups = len(counts) + + if len(labels) == 0: + return + + N, K = ( values).shape + + if out.shape[1] != 4: + raise ValueError('Output array must have 4 columns') + + if K > 1: + raise NotImplementedError("Argument 'values' must have only " + "one dimension") + out.fill(np.nan) + + with nogil: + for i in range(N): + lab = labels[i] + if lab == -1: + continue + + counts[lab] += 1 + val = values[i, 0] + if val != val: + continue + + if out[lab, 0] != out[lab, 0]: + out[lab, 0] = out[lab, 1] = out[lab, 2] = out[lab, 3] 
= val + else: + out[lab, 1] = max(out[lab, 1], val) + out[lab, 2] = min(out[lab, 2], val) + out[lab, 3] = val + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_add_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] labels): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + float32_t val, count + ndarray[float32_t, ndim=2] sumx, nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + + N, K = ( values).shape + + with nogil: + + if K > 1: + + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + sumx[lab, j] += val + + else: + + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[lab, 0] += 1 + sumx[lab, 0] += val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = sumx[i, j] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_prod_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] labels): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + float32_t val, count + ndarray[float32_t, ndim=2] prodx, nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + prodx = np.ones_like(out) + + N, K = ( values).shape + + with nogil: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + prodx[lab, j] *= val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[lab, 0] += 1 + prodx[lab, 0] *= val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = prodx[i, j] + + +@cython.wraparound(False) +@cython.boundscheck(False) +@cython.cdivision(True) +def group_var_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] labels): + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + float32_t val, ct, oldmean + ndarray[float32_t, ndim=2] nobs, mean + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + mean = np.zeros_like(out) + + N, K = ( values).shape + + out[:, :] = 0.0 + + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + oldmean = mean[lab, j] + mean[lab, j] += (val - oldmean) / nobs[lab, j] + out[lab, j] += (val - mean[lab, j]) * (val - oldmean) + + for i in range(ncounts): + for j in range(K): + ct = nobs[i, j] + if ct < 2: + out[i, j] = NAN + else: + out[i, j] /= (ct - 1) +# add passing bin edges, instead of labels + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_mean_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] labels): + 
cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + float32_t val, count + ndarray[float32_t, ndim=2] sumx, nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + + N, K = ( values).shape + + with nogil: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + # not nan + if val == val: + nobs[lab, j] += 1 + sumx[lab, j] += val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + # not nan + if val == val: + nobs[lab, 0] += 1 + sumx[lab, 0] += val + + for i in range(ncounts): + for j in range(K): + count = nobs[i, j] + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = sumx[i, j] / count + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_ohlc_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] labels): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab + float32_t val, count + Py_ssize_t ngroups = len(counts) + + if len(labels) == 0: + return + + N, K = ( values).shape + + if out.shape[1] != 4: + raise ValueError('Output array must have 4 columns') + + if K > 1: + raise NotImplementedError("Argument 'values' must have only " + "one dimension") + out.fill(np.nan) + + with nogil: + for i in range(N): + lab = labels[i] + if lab == -1: + continue + + counts[lab] += 1 + val = values[i, 0] + if val != val: + continue + + if out[lab, 0] != out[lab, 0]: + out[lab, 0] = out[lab, 1] = out[lab, 2] = out[lab, 3] = val + else: + out[lab, 1] = max(out[lab, 1], val) + out[lab, 2] = min(out[lab, 2], val) + out[lab, 3] = val + +#---------------------------------------------------------------------- +# group_nth, group_last +#---------------------------------------------------------------------- + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_last_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + float64_t val, count + ndarray[float64_t, ndim=2] resx + ndarray[int64_t, ndim=2] nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros(( out).shape, dtype=np.int64) + resx = np.empty_like(out) + + N, K = ( values).shape + + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val and val != NAN: + nobs[lab, j] += 1 + resx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = resx[i, j] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_nth_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels, int64_t rank): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + float64_t val, count + ndarray[float64_t, ndim=2] resx + ndarray[int64_t, ndim=2] nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros(( out).shape, dtype=np.int64) + resx = np.empty_like(out) + + N, K = ( values).shape + + with nogil: + 
for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val and val != NAN: + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = resx[i, j] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_last_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] labels): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + float32_t val, count + ndarray[float32_t, ndim=2] resx + ndarray[int64_t, ndim=2] nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros(( out).shape, dtype=np.int64) + resx = np.empty_like(out) + + N, K = ( values).shape + + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val and val != NAN: + nobs[lab, j] += 1 + resx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = resx[i, j] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_nth_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] labels, int64_t rank): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + float32_t val, count + ndarray[float32_t, ndim=2] resx + ndarray[int64_t, ndim=2] nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros(( out).shape, dtype=np.int64) + resx = np.empty_like(out) + + N, K = ( values).shape + + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val and val != NAN: + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = resx[i, j] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_last_int64(ndarray[int64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[int64_t, ndim=2] values, + ndarray[int64_t] labels): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + int64_t val, count + ndarray[int64_t, ndim=2] resx + ndarray[int64_t, ndim=2] nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros(( out).shape, dtype=np.int64) + resx = np.empty_like(out) + + N, K = ( values).shape + + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val and val != iNaT: + nobs[lab, j] += 1 + resx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = iNaT + else: + out[i, j] = resx[i, j] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_nth_int64(ndarray[int64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[int64_t, ndim=2] values, + ndarray[int64_t] labels, int64_t rank): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + int64_t 
val, count + ndarray[int64_t, ndim=2] resx + ndarray[int64_t, ndim=2] nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros(( out).shape, dtype=np.int64) + resx = np.empty_like(out) + + N, K = ( values).shape + + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val and val != iNaT: + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = iNaT + else: + out[i, j] = resx[i, j] + +#---------------------------------------------------------------------- +# group_min, group_max +#---------------------------------------------------------------------- + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_max_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + float64_t val, count + ndarray[float64_t, ndim=2] maxx, nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + + maxx = np.empty_like(out) + maxx.fill(-np.inf) + + N, K = ( values).shape + + with nogil: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val and val != NAN: + nobs[lab, j] += 1 + if val > maxx[lab, j]: + maxx[lab, j] = val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val and val != NAN: + nobs[lab, 0] += 1 + if val > maxx[lab, 0]: + maxx[lab, 0] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = maxx[i, j] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_min_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + float64_t val, count + ndarray[float64_t, ndim=2] minx, nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + + minx = np.empty_like(out) + minx.fill(np.inf) + + N, K = ( values).shape + + with nogil: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val and val != NAN: + + nobs[lab, j] += 1 + if val < minx[lab, j]: + minx[lab, j] = val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val and val != NAN: + nobs[lab, 0] += 1 + if val < minx[lab, 0]: + minx[lab, 0] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = minx[i, j] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_max_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] labels): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + float32_t val, count + ndarray[float32_t, ndim=2] 
maxx, nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + + maxx = np.empty_like(out) + maxx.fill(-np.inf) + + N, K = ( values).shape + + with nogil: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val and val != NAN: + nobs[lab, j] += 1 + if val > maxx[lab, j]: + maxx[lab, j] = val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val and val != NAN: + nobs[lab, 0] += 1 + if val > maxx[lab, 0]: + maxx[lab, 0] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = maxx[i, j] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_min_float32(ndarray[float32_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float32_t, ndim=2] values, + ndarray[int64_t] labels): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + float32_t val, count + ndarray[float32_t, ndim=2] minx, nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + + minx = np.empty_like(out) + minx.fill(np.inf) + + N, K = ( values).shape + + with nogil: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val and val != NAN: + + nobs[lab, j] += 1 + if val < minx[lab, j]: + minx[lab, j] = val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val and val != NAN: + nobs[lab, 0] += 1 + if val < minx[lab, 0]: + minx[lab, 0] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = minx[i, j] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_max_int64(ndarray[int64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[int64_t, ndim=2] values, + ndarray[int64_t] labels): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + int64_t val, count + ndarray[int64_t, ndim=2] maxx, nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + + maxx = np.empty_like(out) + maxx.fill(-_int64_max) + + N, K = ( values).shape + + with nogil: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val and val != iNaT: + nobs[lab, j] += 1 + if val > maxx[lab, j]: + maxx[lab, j] = val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val and val != iNaT: + nobs[lab, 0] += 1 + if val > maxx[lab, 0]: + maxx[lab, 0] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = iNaT + else: + out[i, j] = maxx[i, j] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_min_int64(ndarray[int64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[int64_t, ndim=2] values, + ndarray[int64_t] labels): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + int64_t val, count + ndarray[int64_t, ndim=2] minx, nobs + + 
if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + + minx = np.empty_like(out) + minx.fill(_int64_max) + + N, K = ( values).shape + + with nogil: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val and val != iNaT: + + nobs[lab, j] += 1 + if val < minx[lab, j]: + minx[lab, j] = val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val and val != iNaT: + nobs[lab, 0] += 1 + if val < minx[lab, 0]: + minx[lab, 0] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = iNaT + else: + out[i, j] = minx[i, j] + +#---------------------------------------------------------------------- +# other grouping functions not needing a template +#---------------------------------------------------------------------- + + +def group_median_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, ngroups, size + ndarray[int64_t] _counts + ndarray data + float64_t* ptr + ngroups = len(counts) + N, K = ( values).shape + + indexer, _counts = groupsort_indexer(labels, ngroups) + counts[:] = _counts[1:] + + data = np.empty((K, N), dtype=np.float64) + ptr = data.data + + take_2d_axis1_float64_float64(values.T, indexer, out=data) + + for i in range(K): + # exclude NA group + ptr += _counts[0] + for j in range(ngroups): + size = _counts[j + 1] + out[j, i] = _median_linear(ptr, size) + ptr += size + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_cumprod_float64(float64_t[:, :] out, + float64_t[:, :] values, + int64_t[:] labels, + float64_t[:, :] accum): + """ + Only transforms on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, size + float64_t val + int64_t lab + + N, K = ( values).shape + accum = np.ones_like(accum) + + with nogil: + for i in range(N): + lab = labels[i] + + if lab < 0: + continue + for j in range(K): + val = values[i, j] + if val == val: + accum[lab, j] *= val + out[i, j] = accum[lab, j] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_cumsum(numeric[:, :] out, + numeric[:, :] values, + int64_t[:] labels, + numeric[:, :] accum): + """ + Only transforms on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, size + numeric val + int64_t lab + + N, K = ( values).shape + accum = np.zeros_like(accum) + + with nogil: + for i in range(N): + lab = labels[i] + + if lab < 0: + continue + for j in range(K): + val = values[i, j] + if val == val: + accum[lab, j] += val + out[i, j] = accum[lab, j] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_shift_indexer(int64_t[:] out, int64_t[:] labels, + int ngroups, int periods): + cdef: + Py_ssize_t N, i, j, ii + int offset, sign + int64_t lab, idxer, idxer_slot + int64_t[:] label_seen = np.zeros(ngroups, dtype=np.int64) + int64_t[:, :] label_indexer + + N, = ( labels).shape + + if periods < 0: + periods = -periods + offset = N - 1 + sign = -1 + elif periods > 0: + offset = 0 + sign = 1 + + if periods == 0: + with nogil: + for i in range(N): + out[i] = i + else: + # array of each previous indexer seen + label_indexer = np.zeros((ngroups, periods), dtype=np.int64) + with nogil: + for i in range(N): + ## reverse iterator if shifting backwards + ii = offset + sign * i + 
lab = labels[ii] + + # Skip null keys + if lab == -1: + out[ii] = -1 + continue + + label_seen[lab] += 1 + + idxer_slot = label_seen[lab] % periods + idxer = label_indexer[lab, idxer_slot] + + if label_seen[lab] > periods: + out[ii] = idxer + else: + out[ii] = -1 + + label_indexer[lab, idxer_slot] = ii diff --git a/pandas/src/algos_groupby_helper.pxi.in b/pandas/src/algos_groupby_helper.pxi.in new file mode 100644 index 0000000000000..5c704436ce3a0 --- /dev/null +++ b/pandas/src/algos_groupby_helper.pxi.in @@ -0,0 +1,719 @@ +""" +Template for each `dtype` helper function using groupby + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +cdef extern from "numpy/npy_math.h": + double NAN "NPY_NAN" +_int64_max = np.iinfo(np.int64).max + +#---------------------------------------------------------------------- +# group_add, group_prod, group_var, group_mean, group_ohlc +#---------------------------------------------------------------------- + +{{py: + +# name, c_type, dest_type, dest_dtype +dtypes = [('float64', 'float64_t', 'float64_t', 'np.float64'), + ('float32', 'float32_t', 'float32_t', 'np.float32')] + +def get_dispatch(dtypes): + + for name, c_type, dest_type, dest_dtype in dtypes: + + dest_type2 = dest_type + dest_type = dest_type.replace('_t', '') + + yield name, c_type, dest_type, dest_type2, dest_dtype +}} + +{{for name, c_type, dest_type, dest_type2, dest_dtype in get_dispatch(dtypes)}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_add_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, + ndarray[int64_t] counts, + ndarray[{{c_type}}, ndim=2] values, + ndarray[int64_t] labels): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + {{dest_type2}} val, count + ndarray[{{dest_type2}}, ndim=2] sumx, nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + + N, K = ( values).shape + + with nogil: + + if K > 1: + + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + sumx[lab, j] += val + + else: + + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[lab, 0] += 1 + sumx[lab, 0] += val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = sumx[i, j] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_prod_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, + ndarray[int64_t] counts, + ndarray[{{c_type}}, ndim=2] values, + ndarray[int64_t] labels): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + {{dest_type2}} val, count + ndarray[{{dest_type2}}, ndim=2] prodx, nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + prodx = np.ones_like(out) + + N, K = ( values).shape + + with nogil: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + prodx[lab, j] *= val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[lab, 0] += 1 + prodx[lab, 0] *= val + + for i in 
range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = prodx[i, j] + + +@cython.wraparound(False) +@cython.boundscheck(False) +@cython.cdivision(True) +def group_var_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, + ndarray[int64_t] counts, + ndarray[{{dest_type2}}, ndim=2] values, + ndarray[int64_t] labels): + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + {{dest_type2}} val, ct, oldmean + ndarray[{{dest_type2}}, ndim=2] nobs, mean + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + mean = np.zeros_like(out) + + N, K = ( values).shape + + out[:, :] = 0.0 + + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + oldmean = mean[lab, j] + mean[lab, j] += (val - oldmean) / nobs[lab, j] + out[lab, j] += (val - mean[lab, j]) * (val - oldmean) + + for i in range(ncounts): + for j in range(K): + ct = nobs[i, j] + if ct < 2: + out[i, j] = NAN + else: + out[i, j] /= (ct - 1) +# add passing bin edges, instead of labels + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_mean_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, + ndarray[int64_t] counts, + ndarray[{{dest_type2}}, ndim=2] values, + ndarray[int64_t] labels): + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + {{dest_type2}} val, count + ndarray[{{dest_type2}}, ndim=2] sumx, nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + + N, K = ( values).shape + + with nogil: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + # not nan + if val == val: + nobs[lab, j] += 1 + sumx[lab, j] += val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + # not nan + if val == val: + nobs[lab, 0] += 1 + sumx[lab, 0] += val + + for i in range(ncounts): + for j in range(K): + count = nobs[i, j] + if nobs[i, j] == 0: + out[i, j] = NAN + else: + out[i, j] = sumx[i, j] / count + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_ohlc_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, + ndarray[int64_t] counts, + ndarray[{{dest_type2}}, ndim=2] values, + ndarray[int64_t] labels): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab + {{dest_type2}} val, count + Py_ssize_t ngroups = len(counts) + + if len(labels) == 0: + return + + N, K = ( values).shape + + if out.shape[1] != 4: + raise ValueError('Output array must have 4 columns') + + if K > 1: + raise NotImplementedError("Argument 'values' must have only " + "one dimension") + out.fill(np.nan) + + with nogil: + for i in range(N): + lab = labels[i] + if lab == -1: + continue + + counts[lab] += 1 + val = values[i, 0] + if val != val: + continue + + if out[lab, 0] != out[lab, 0]: + out[lab, 0] = out[lab, 1] = out[lab, 2] = out[lab, 3] = val + else: + out[lab, 1] = max(out[lab, 1], val) + out[lab, 2] = min(out[lab, 2], val) + out[lab, 3] = val + +{{endfor}} + +#---------------------------------------------------------------------- +# group_nth, group_last +#---------------------------------------------------------------------- + +{{py: + +# name, c_type, dest_type2, nan_val +dtypes = [('float64', 'float64_t', 'float64_t', 'NAN'), + ('float32', 
'float32_t', 'float32_t', 'NAN'), + ('int64', 'int64_t', 'int64_t', 'iNaT')] + +def get_dispatch(dtypes): + + for name, c_type, dest_type2, nan_val in dtypes: + + yield name, c_type, dest_type2, nan_val +}} + + +{{for name, c_type, dest_type2, nan_val in get_dispatch(dtypes)}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, + ndarray[int64_t] counts, + ndarray[{{c_type}}, ndim=2] values, + ndarray[int64_t] labels): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + {{dest_type2}} val, count + ndarray[{{dest_type2}}, ndim=2] resx + ndarray[int64_t, ndim=2] nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros(( out).shape, dtype=np.int64) + resx = np.empty_like(out) + + N, K = ( values).shape + + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val and val != {{nan_val}}: + nobs[lab, j] += 1 + resx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = {{nan_val}} + else: + out[i, j] = resx[i, j] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, + ndarray[int64_t] counts, + ndarray[{{c_type}}, ndim=2] values, + ndarray[int64_t] labels, int64_t rank): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + {{dest_type2}} val, count + ndarray[{{dest_type2}}, ndim=2] resx + ndarray[int64_t, ndim=2] nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros(( out).shape, dtype=np.int64) + resx = np.empty_like(out) + + N, K = ( values).shape + + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val and val != {{nan_val}}: + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = {{nan_val}} + else: + out[i, j] = resx[i, j] + +{{endfor}} + +#---------------------------------------------------------------------- +# group_min, group_max +#---------------------------------------------------------------------- + +{{py: + +# name, c_type, dest_type2, nan_val +dtypes = [('float64', 'float64_t', 'NAN', 'np.inf'), + ('float32', 'float32_t', 'NAN', 'np.inf'), + ('int64', 'int64_t', 'iNaT', '_int64_max')] + +def get_dispatch(dtypes): + + for name, dest_type2, nan_val, inf_val in dtypes: + yield name, dest_type2, nan_val, inf_val +}} + + +{{for name, dest_type2, nan_val, inf_val in get_dispatch(dtypes)}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, + ndarray[int64_t] counts, + ndarray[{{dest_type2}}, ndim=2] values, + ndarray[int64_t] labels): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + {{dest_type2}} val, count + ndarray[{{dest_type2}}, ndim=2] maxx, nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + + maxx = np.empty_like(out) + maxx.fill(-{{inf_val}}) + + N, K = ( values).shape + + with nogil: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + 
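Aside (illustrative, not part of the patch; the group_max loop continues immediately below): in this template file the {{py: ...}} block defines the dtype table and get_dispatch(), and each {{for ...}}/{{endfor}} pair stamps out one concrete function per dtype when the .pxi is generated from the .pxi.in, which is why the header warns against editing the generated file directly. group_last and group_nth above keep, per group and column, the most recent and the rank-th non-missing value respectively (with iNaT treated as missing in the int64 instantiation), while group_var uses a single-pass streaming update. A sketch of that variance update, under the hypothetical name group_var_ref:

    import numpy as np

    def group_var_ref(values, labels, ngroups):
        # Streaming (Welford-style) per-group sample variance, as in group_var above.
        N, K = values.shape
        nobs = np.zeros((ngroups, K))
        mean = np.zeros((ngroups, K))
        m2 = np.zeros((ngroups, K))          # running sum of squared deviations
        for i in range(N):
            lab = labels[i]
            if lab < 0:
                continue
            for j in range(K):
                val = values[i, j]
                if val == val:               # skip NaN
                    nobs[lab, j] += 1
                    old = mean[lab, j]
                    mean[lab, j] += (val - old) / nobs[lab, j]
                    m2[lab, j] += (val - mean[lab, j]) * (val - old)
        out = np.full((ngroups, K), np.nan)  # groups with fewer than 2 observations stay NaN
        ok = nobs >= 2
        out[ok] = m2[ok] / (nobs[ok] - 1)    # ddof=1, matching the (ct - 1) divisor
        return out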
counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val and val != {{nan_val}}: + nobs[lab, j] += 1 + if val > maxx[lab, j]: + maxx[lab, j] = val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val and val != {{nan_val}}: + nobs[lab, 0] += 1 + if val > maxx[lab, 0]: + maxx[lab, 0] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = {{nan_val}} + else: + out[i, j] = maxx[i, j] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, + ndarray[int64_t] counts, + ndarray[{{dest_type2}}, ndim=2] values, + ndarray[int64_t] labels): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, lab, ncounts = len(counts) + {{dest_type2}} val, count + ndarray[{{dest_type2}}, ndim=2] minx, nobs + + if not len(values) == len(labels): + raise AssertionError("len(index) != len(labels)") + + nobs = np.zeros_like(out) + + minx = np.empty_like(out) + minx.fill({{inf_val}}) + + N, K = ( values).shape + + with nogil: + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val and val != {{nan_val}}: + + nobs[lab, j] += 1 + if val < minx[lab, j]: + minx[lab, j] = val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val and val != {{nan_val}}: + nobs[lab, 0] += 1 + if val < minx[lab, 0]: + minx[lab, 0] = val + + for i in range(ncounts): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = {{nan_val}} + else: + out[i, j] = minx[i, j] + +{{endfor}} + +#---------------------------------------------------------------------- +# other grouping functions not needing a template +#---------------------------------------------------------------------- + + +def group_median_float64(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): + """ + Only aggregates on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, ngroups, size + ndarray[int64_t] _counts + ndarray data + float64_t* ptr + ngroups = len(counts) + N, K = ( values).shape + + indexer, _counts = groupsort_indexer(labels, ngroups) + counts[:] = _counts[1:] + + data = np.empty((K, N), dtype=np.float64) + ptr = data.data + + take_2d_axis1_float64_float64(values.T, indexer, out=data) + + for i in range(K): + # exclude NA group + ptr += _counts[0] + for j in range(ngroups): + size = _counts[j + 1] + out[j, i] = _median_linear(ptr, size) + ptr += size + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_cumprod_float64(float64_t[:, :] out, + float64_t[:, :] values, + int64_t[:] labels, + float64_t[:, :] accum): + """ + Only transforms on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, size + float64_t val + int64_t lab + + N, K = ( values).shape + accum = np.ones_like(accum) + + with nogil: + for i in range(N): + lab = labels[i] + + if lab < 0: + continue + for j in range(K): + val = values[i, j] + if val == val: + accum[lab, j] *= val + out[i, j] = accum[lab, j] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_cumsum(numeric[:, :] out, + numeric[:, :] values, + int64_t[:] labels, + numeric[:, :] accum): + """ + Only transforms on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, size + numeric val + int64_t lab + + N, K = ( 
values).shape + accum = np.zeros_like(accum) + + with nogil: + for i in range(N): + lab = labels[i] + + if lab < 0: + continue + for j in range(K): + val = values[i, j] + if val == val: + accum[lab, j] += val + out[i, j] = accum[lab, j] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_shift_indexer(int64_t[:] out, int64_t[:] labels, + int ngroups, int periods): + cdef: + Py_ssize_t N, i, j, ii + int offset, sign + int64_t lab, idxer, idxer_slot + int64_t[:] label_seen = np.zeros(ngroups, dtype=np.int64) + int64_t[:, :] label_indexer + + N, = ( labels).shape + + if periods < 0: + periods = -periods + offset = N - 1 + sign = -1 + elif periods > 0: + offset = 0 + sign = 1 + + if periods == 0: + with nogil: + for i in range(N): + out[i] = i + else: + # array of each previous indexer seen + label_indexer = np.zeros((ngroups, periods), dtype=np.int64) + with nogil: + for i in range(N): + ## reverse iterator if shifting backwards + ii = offset + sign * i + lab = labels[ii] + + # Skip null keys + if lab == -1: + out[ii] = -1 + continue + + label_seen[lab] += 1 + + idxer_slot = label_seen[lab] % periods + idxer = label_indexer[lab, idxer_slot] + + if label_seen[lab] > periods: + out[ii] = idxer + else: + out[ii] = -1 + + label_indexer[lab, idxer_slot] = ii diff --git a/pandas/src/algos_take_helper.pxi b/pandas/src/algos_take_helper.pxi new file mode 100644 index 0000000000000..d8fb05804d4e5 --- /dev/null +++ b/pandas/src/algos_take_helper.pxi @@ -0,0 +1,4949 @@ +""" +Template for each `dtype` helper function for take + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +#---------------------------------------------------------------------- +# take_1d, take_2d +#---------------------------------------------------------------------- + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_1d_bool_bool_memview(uint8_t[:] values, + int64_t[:] indexer, + uint8_t[:] out, + fill_value=np.nan): + + + + cdef: + Py_ssize_t i, n, idx + uint8_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_1d_bool_bool(ndarray[uint8_t, ndim=1] values, + int64_t[:] indexer, + uint8_t[:] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_1d_bool_bool_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
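Aside (illustrative, not part of the patch; the take_1d_bool_bool body resumes below): every generated take_1d_* kernel implements the same rule, out[i] = values[indexer[i]], with an indexer value of -1 standing for "missing" and producing fill_value instead; the surrounding values.flags.writeable dispatch simply chooses the typed-memoryview variant when it can and the plain-ndarray variant for read-only buffers, as the comment above explains. A runnable mirror under the hypothetical name take_1d_ref:

    import numpy as np

    def take_1d_ref(values, indexer, out, fill_value=np.nan):
        # Same per-element rule as the generated take_1d_* kernels.
        for i, idx in enumerate(indexer):
            out[i] = fill_value if idx == -1 else values[idx]
        return out

    vals = np.array([10, 20, 30], dtype=np.int8)
    idx = np.array([2, -1, 0], dtype=np.int64)
    out = np.empty(3, dtype=np.float64)      # float64 out so the missing slot can hold NaN
    take_1d_ref(vals, idx, out)              # -> array([30., nan, 10.])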
+ + cdef: + Py_ssize_t i, n, idx + uint8_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_bool_bool_memview(uint8_t[:, :] values, + int64_t[:] indexer, + uint8_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + uint8_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF True: + cdef: + uint8_t *v + uint8_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(uint8_t) and + sizeof(uint8_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(uint8_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_bool_bool(ndarray[uint8_t, ndim=2] values, + ndarray[int64_t] indexer, + uint8_t[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_bool_bool_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + cdef: + Py_ssize_t i, j, k, n, idx + uint8_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF True: + cdef: + uint8_t *v + uint8_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(uint8_t) and + sizeof(uint8_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(uint8_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_bool_bool_memview(uint8_t[:, :] values, + int64_t[:] indexer, + uint8_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + uint8_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_bool_bool(ndarray[uint8_t, ndim=2] values, + ndarray[int64_t] indexer, + uint8_t[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_bool_bool_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
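Aside (illustrative, not part of the patch; the take_2d_axis1_bool_bool body resumes below): the axis0 kernels above re-index rows (out[i, :] comes from values[indexer[i], :]) and the axis1 kernels re-index columns, again with -1 producing fill_value. The branch referencing GH3130 is a fast path: when the rows of both arrays are C-contiguous (inner stride equal to the item size) and the copy is large enough, each selected row is copied with a single memmove instead of an element-by-element inner loop. A NumPy-level equivalent under the hypothetical name take_2d_ref:

    import numpy as np

    def take_2d_ref(values, indexer, fill_value=np.nan, axis=0):
        # axis=0 mirrors take_2d_axis0_*, axis=1 mirrors take_2d_axis1_*.
        indexer = np.asarray(indexer)
        out = np.take(values, indexer, axis=axis).astype(
            np.result_type(values, fill_value))
        if axis == 0:
            out[indexer == -1, :] = fill_value   # rows picked with -1 become fill_value
        else:
            out[:, indexer == -1] = fill_value   # columns picked with -1 become fill_value
        return out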
+ cdef: + Py_ssize_t i, j, k, n, idx + uint8_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_bool_bool(ndarray[uint8_t, ndim=2] values, + indexer, + ndarray[uint8_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + uint8_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_1d_bool_object_memview(uint8_t[:] values, + int64_t[:] indexer, + object[:] out, + fill_value=np.nan): + + + + cdef: + Py_ssize_t i, n, idx + object fv + + n = indexer.shape[0] + + fv = fill_value + + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = True if values[idx] > 0 else False + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_1d_bool_object(ndarray[uint8_t, ndim=1] values, + int64_t[:] indexer, + object[:] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_1d_bool_object_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + + cdef: + Py_ssize_t i, n, idx + object fv + + n = indexer.shape[0] + + fv = fill_value + + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = True if values[idx] > 0 else False + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_bool_object_memview(uint8_t[:, :] values, + int64_t[:] indexer, + object[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + object fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + object *v + object *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(object) and + sizeof(object) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(object) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = True if values[idx, j] > 0 else False + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_bool_object(ndarray[uint8_t, ndim=2] values, + ndarray[int64_t] indexer, + object[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_bool_object_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
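Aside (illustrative, not part of the patch; the take_2d_axis0_bool_object body resumes below): take_2d_multi_* applies a row indexer and a column indexer in one pass, and the *_bool_object variants convert the uint8 storage back into real Python booleans when the destination is object dtype, which is what the `True if values[idx] > 0 else False` expression is doing. A NumPy-level mirror of the multi-take under the hypothetical name take_2d_multi_ref:

    import numpy as np

    def take_2d_multi_ref(values, row_idx, col_idx, fill_value=np.nan):
        # Re-index rows and columns at once; -1 in either indexer means fill_value.
        row_idx = np.asarray(row_idx)
        col_idx = np.asarray(col_idx)
        out = np.asarray(values)[np.ix_(row_idx, col_idx)].astype(
            np.result_type(values, fill_value))
        out[row_idx == -1, :] = fill_value
        out[:, col_idx == -1] = fill_value
        return out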
+ cdef: + Py_ssize_t i, j, k, n, idx + object fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + object *v + object *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(object) and + sizeof(object) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(object) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = True if values[idx, j] > 0 else False + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_bool_object_memview(uint8_t[:, :] values, + int64_t[:] indexer, + object[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + object fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = True if values[i, idx] > 0 else False + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_bool_object(ndarray[uint8_t, ndim=2] values, + ndarray[int64_t] indexer, + object[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_bool_object_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + cdef: + Py_ssize_t i, j, k, n, idx + object fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = True if values[i, idx] > 0 else False + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_bool_object(ndarray[uint8_t, ndim=2] values, + indexer, + ndarray[object, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + object fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = True if values[idx, idx1[j]] > 0 else False + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_1d_int8_int8_memview(int8_t[:] values, + int64_t[:] indexer, + int8_t[:] out, + fill_value=np.nan): + + + + cdef: + Py_ssize_t i, n, idx + int8_t fv + + n = indexer.shape[0] + + fv = fill_value + + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_1d_int8_int8(ndarray[int8_t, ndim=1] values, + int64_t[:] indexer, + int8_t[:] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_1d_int8_int8_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. 
Instead we can use + # the slightly slower Cython ndarray type directly. + + cdef: + Py_ssize_t i, n, idx + int8_t fv + + n = indexer.shape[0] + + fv = fill_value + + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_int8_int8_memview(int8_t[:, :] values, + int64_t[:] indexer, + int8_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int8_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF True: + cdef: + int8_t *v + int8_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int8_t) and + sizeof(int8_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int8_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int8_int8(ndarray[int8_t, ndim=2] values, + ndarray[int64_t] indexer, + int8_t[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_int8_int8_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + cdef: + Py_ssize_t i, j, k, n, idx + int8_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF True: + cdef: + int8_t *v + int8_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int8_t) and + sizeof(int8_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int8_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_int8_int8_memview(int8_t[:, :] values, + int64_t[:] indexer, + int8_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int8_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int8_int8(ndarray[int8_t, ndim=2] values, + ndarray[int64_t] indexer, + int8_t[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_int8_int8_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ cdef: + Py_ssize_t i, j, k, n, idx + int8_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int8_int8(ndarray[int8_t, ndim=2] values, + indexer, + ndarray[int8_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + int8_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_1d_int8_int32_memview(int8_t[:] values, + int64_t[:] indexer, + int32_t[:] out, + fill_value=np.nan): + + + + cdef: + Py_ssize_t i, n, idx + int32_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_1d_int8_int32(ndarray[int8_t, ndim=1] values, + int64_t[:] indexer, + int32_t[:] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_1d_int8_int32_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + + cdef: + Py_ssize_t i, n, idx + int32_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_int8_int32_memview(int8_t[:, :] values, + int64_t[:] indexer, + int32_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int32_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + int32_t *v + int32_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int32_t) and + sizeof(int32_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int32_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int8_int32(ndarray[int8_t, ndim=2] values, + ndarray[int64_t] indexer, + int32_t[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_int8_int32_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
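Aside (illustrative, not part of the patch; the take_2d_axis0_int8_int32 body resumes below): the template is instantiated for (source, destination) dtype pairs such as int8 -> int8, int8 -> int32, int8 -> int64 and int8 -> float64 because the pre-allocated out array must be able to hold fill_value as well as the source values; NaN in particular forces a floating-point destination for integer sources. Note also that the row-copy fast path is generated as IF True only when source and destination dtypes match (e.g. int8 -> int8) and as IF False otherwise, presumably because a raw byte-for-byte memmove is only meaningful when the two layouts are identical. A small worked example of the widening, using plain NumPy:

    import numpy as np

    vals = np.array([1, 2, 3], dtype=np.int8)
    idx = np.array([0, -1, 2], dtype=np.int64)

    out = np.empty(3, dtype=np.float64)  # wider destination dtype chosen up front
    out[:] = vals[idx]                   # fancy indexing; the -1 slot is fixed up next
    out[idx == -1] = np.nan              # NaN marks the missing position
    print(out)                           # [ 1. nan  3.]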
+ cdef: + Py_ssize_t i, j, k, n, idx + int32_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + int32_t *v + int32_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int32_t) and + sizeof(int32_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int32_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_int8_int32_memview(int8_t[:, :] values, + int64_t[:] indexer, + int32_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int32_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int8_int32(ndarray[int8_t, ndim=2] values, + ndarray[int64_t] indexer, + int32_t[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_int8_int32_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + cdef: + Py_ssize_t i, j, k, n, idx + int32_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int8_int32(ndarray[int8_t, ndim=2] values, + indexer, + ndarray[int32_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + int32_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_1d_int8_int64_memview(int8_t[:] values, + int64_t[:] indexer, + int64_t[:] out, + fill_value=np.nan): + + + + cdef: + Py_ssize_t i, n, idx + int64_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_1d_int8_int64(ndarray[int8_t, ndim=1] values, + int64_t[:] indexer, + int64_t[:] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_1d_int8_int64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ + cdef: + Py_ssize_t i, n, idx + int64_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_int8_int64_memview(int8_t[:, :] values, + int64_t[:] indexer, + int64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + int64_t *v + int64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int64_t) and + sizeof(int64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int8_int64(ndarray[int8_t, ndim=2] values, + ndarray[int64_t] indexer, + int64_t[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_int8_int64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + int64_t *v + int64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int64_t) and + sizeof(int64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_int8_int64_memview(int8_t[:, :] values, + int64_t[:] indexer, + int64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int8_int64(ndarray[int8_t, ndim=2] values, + ndarray[int64_t] indexer, + int64_t[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_int8_int64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int8_int64(ndarray[int8_t, ndim=2] values, + indexer, + ndarray[int64_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + int64_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_1d_int8_float64_memview(int8_t[:] values, + int64_t[:] indexer, + float64_t[:] out, + fill_value=np.nan): + + + + cdef: + Py_ssize_t i, n, idx + float64_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_1d_int8_float64(ndarray[int8_t, ndim=1] values, + int64_t[:] indexer, + float64_t[:] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_1d_int8_float64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + + cdef: + Py_ssize_t i, n, idx + float64_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_int8_float64_memview(int8_t[:, :] values, + int64_t[:] indexer, + float64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + float64_t *v + float64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(float64_t) and + sizeof(float64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(float64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int8_float64(ndarray[int8_t, ndim=2] values, + ndarray[int64_t] indexer, + float64_t[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_int8_float64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + float64_t *v + float64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(float64_t) and + sizeof(float64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(float64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_int8_float64_memview(int8_t[:, :] values, + int64_t[:] indexer, + float64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int8_float64(ndarray[int8_t, ndim=2] values, + ndarray[int64_t] indexer, + float64_t[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_int8_float64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int8_float64(ndarray[int8_t, ndim=2] values, + indexer, + ndarray[float64_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + float64_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_1d_int16_int16_memview(int16_t[:] values, + int64_t[:] indexer, + int16_t[:] out, + fill_value=np.nan): + + + + cdef: + Py_ssize_t i, n, idx + int16_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_1d_int16_int16(ndarray[int16_t, ndim=1] values, + int64_t[:] indexer, + int16_t[:] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_1d_int16_int16_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ + cdef: + Py_ssize_t i, n, idx + int16_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_int16_int16_memview(int16_t[:, :] values, + int64_t[:] indexer, + int16_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int16_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF True: + cdef: + int16_t *v + int16_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int16_t) and + sizeof(int16_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int16_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int16_int16(ndarray[int16_t, ndim=2] values, + ndarray[int64_t] indexer, + int16_t[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_int16_int16_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + cdef: + Py_ssize_t i, j, k, n, idx + int16_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF True: + cdef: + int16_t *v + int16_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int16_t) and + sizeof(int16_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int16_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_int16_int16_memview(int16_t[:, :] values, + int64_t[:] indexer, + int16_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int16_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int16_int16(ndarray[int16_t, ndim=2] values, + ndarray[int64_t] indexer, + int16_t[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_int16_int16_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ cdef: + Py_ssize_t i, j, k, n, idx + int16_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int16_int16(ndarray[int16_t, ndim=2] values, + indexer, + ndarray[int16_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + int16_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_1d_int16_int32_memview(int16_t[:] values, + int64_t[:] indexer, + int32_t[:] out, + fill_value=np.nan): + + + + cdef: + Py_ssize_t i, n, idx + int32_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_1d_int16_int32(ndarray[int16_t, ndim=1] values, + int64_t[:] indexer, + int32_t[:] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_1d_int16_int32_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + + cdef: + Py_ssize_t i, n, idx + int32_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_int16_int32_memview(int16_t[:, :] values, + int64_t[:] indexer, + int32_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int32_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + int32_t *v + int32_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int32_t) and + sizeof(int32_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int32_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int16_int32(ndarray[int16_t, ndim=2] values, + ndarray[int64_t] indexer, + int32_t[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_int16_int32_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ cdef: + Py_ssize_t i, j, k, n, idx + int32_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + int32_t *v + int32_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int32_t) and + sizeof(int32_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int32_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_int16_int32_memview(int16_t[:, :] values, + int64_t[:] indexer, + int32_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int32_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int16_int32(ndarray[int16_t, ndim=2] values, + ndarray[int64_t] indexer, + int32_t[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_int16_int32_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + cdef: + Py_ssize_t i, j, k, n, idx + int32_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int16_int32(ndarray[int16_t, ndim=2] values, + indexer, + ndarray[int32_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + int32_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_1d_int16_int64_memview(int16_t[:] values, + int64_t[:] indexer, + int64_t[:] out, + fill_value=np.nan): + + + + cdef: + Py_ssize_t i, n, idx + int64_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_1d_int16_int64(ndarray[int16_t, ndim=1] values, + int64_t[:] indexer, + int64_t[:] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_1d_int16_int64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ + cdef: + Py_ssize_t i, n, idx + int64_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_int16_int64_memview(int16_t[:, :] values, + int64_t[:] indexer, + int64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + int64_t *v + int64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int64_t) and + sizeof(int64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int16_int64(ndarray[int16_t, ndim=2] values, + ndarray[int64_t] indexer, + int64_t[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_int16_int64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + int64_t *v + int64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int64_t) and + sizeof(int64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_int16_int64_memview(int16_t[:, :] values, + int64_t[:] indexer, + int64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int16_int64(ndarray[int16_t, ndim=2] values, + ndarray[int64_t] indexer, + int64_t[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_int16_int64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int16_int64(ndarray[int16_t, ndim=2] values, + indexer, + ndarray[int64_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + int64_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_1d_int16_float64_memview(int16_t[:] values, + int64_t[:] indexer, + float64_t[:] out, + fill_value=np.nan): + + + + cdef: + Py_ssize_t i, n, idx + float64_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_1d_int16_float64(ndarray[int16_t, ndim=1] values, + int64_t[:] indexer, + float64_t[:] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_1d_int16_float64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + + cdef: + Py_ssize_t i, n, idx + float64_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_int16_float64_memview(int16_t[:, :] values, + int64_t[:] indexer, + float64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + float64_t *v + float64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(float64_t) and + sizeof(float64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(float64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int16_float64(ndarray[int16_t, ndim=2] values, + ndarray[int64_t] indexer, + float64_t[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_int16_float64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + float64_t *v + float64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(float64_t) and + sizeof(float64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(float64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_int16_float64_memview(int16_t[:, :] values, + int64_t[:] indexer, + float64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int16_float64(ndarray[int16_t, ndim=2] values, + ndarray[int64_t] indexer, + float64_t[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_int16_float64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int16_float64(ndarray[int16_t, ndim=2] values, + indexer, + ndarray[float64_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + float64_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_1d_int32_int32_memview(int32_t[:] values, + int64_t[:] indexer, + int32_t[:] out, + fill_value=np.nan): + + + + cdef: + Py_ssize_t i, n, idx + int32_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_1d_int32_int32(ndarray[int32_t, ndim=1] values, + int64_t[:] indexer, + int32_t[:] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_1d_int32_int32_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ + cdef: + Py_ssize_t i, n, idx + int32_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_int32_int32_memview(int32_t[:, :] values, + int64_t[:] indexer, + int32_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int32_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF True: + cdef: + int32_t *v + int32_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int32_t) and + sizeof(int32_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int32_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int32_int32(ndarray[int32_t, ndim=2] values, + ndarray[int64_t] indexer, + int32_t[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_int32_int32_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + cdef: + Py_ssize_t i, j, k, n, idx + int32_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF True: + cdef: + int32_t *v + int32_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int32_t) and + sizeof(int32_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int32_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_int32_int32_memview(int32_t[:, :] values, + int64_t[:] indexer, + int32_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int32_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int32_int32(ndarray[int32_t, ndim=2] values, + ndarray[int64_t] indexer, + int32_t[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_int32_int32_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
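[Editor's note, not part of the patch] The IF True: / IF False: switches in the 2-D axis-0 kernels come from the template's can_copy flag: only dtype pairs with identical element width compile the GH3130 fast path, which copies an entire contiguous row with a single memmove once the strides line up, instead of looping over the elements. A rough NumPy analogue of the two row-copy strategies (the helper name is hypothetical):

import numpy as np

def take_2d_axis0_sketch(values, indexer, out, fill_value=np.nan):
    # Row-wise take along axis 0; -1 means "fill this whole output row".
    same_layout = (values.dtype == out.dtype and
                   values.flags.c_contiguous and out.flags.c_contiguous)
    for i, idx in enumerate(indexer):
        if idx == -1:
            out[i, :] = fill_value
        elif same_layout:
            out[i, :] = values[idx, :]          # block copy: the memmove analogue
        else:
            for j in range(values.shape[1]):    # element loop: the generic path
                out[i, j] = values[idx, j]

values = np.arange(12, dtype=np.float64).reshape(4, 3)
out = np.empty((3, 3), dtype=np.float64)
take_2d_axis0_sketch(values, [3, -1, 0], out)
print(out)                                      # rows 3, all-NaN, 0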
+ cdef: + Py_ssize_t i, j, k, n, idx + int32_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int32_int32(ndarray[int32_t, ndim=2] values, + indexer, + ndarray[int32_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + int32_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_1d_int32_int64_memview(int32_t[:] values, + int64_t[:] indexer, + int64_t[:] out, + fill_value=np.nan): + + + + cdef: + Py_ssize_t i, n, idx + int64_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_1d_int32_int64(ndarray[int32_t, ndim=1] values, + int64_t[:] indexer, + int64_t[:] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_1d_int32_int64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + + cdef: + Py_ssize_t i, n, idx + int64_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_int32_int64_memview(int32_t[:, :] values, + int64_t[:] indexer, + int64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + int64_t *v + int64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int64_t) and + sizeof(int64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int32_int64(ndarray[int32_t, ndim=2] values, + ndarray[int64_t] indexer, + int64_t[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_int32_int64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + int64_t *v + int64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int64_t) and + sizeof(int64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_int32_int64_memview(int32_t[:, :] values, + int64_t[:] indexer, + int64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int32_int64(ndarray[int32_t, ndim=2] values, + ndarray[int64_t] indexer, + int64_t[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_int32_int64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int32_int64(ndarray[int32_t, ndim=2] values, + indexer, + ndarray[int64_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + int64_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_1d_int32_float64_memview(int32_t[:] values, + int64_t[:] indexer, + float64_t[:] out, + fill_value=np.nan): + + + + cdef: + Py_ssize_t i, n, idx + float64_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_1d_int32_float64(ndarray[int32_t, ndim=1] values, + int64_t[:] indexer, + float64_t[:] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_1d_int32_float64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ + cdef: + Py_ssize_t i, n, idx + float64_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_int32_float64_memview(int32_t[:, :] values, + int64_t[:] indexer, + float64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + float64_t *v + float64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(float64_t) and + sizeof(float64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(float64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int32_float64(ndarray[int32_t, ndim=2] values, + ndarray[int64_t] indexer, + float64_t[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_int32_float64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + float64_t *v + float64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(float64_t) and + sizeof(float64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(float64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_int32_float64_memview(int32_t[:, :] values, + int64_t[:] indexer, + float64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int32_float64(ndarray[int32_t, ndim=2] values, + ndarray[int64_t] indexer, + float64_t[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_int32_float64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int32_float64(ndarray[int32_t, ndim=2] values, + indexer, + ndarray[float64_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + float64_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_1d_int64_int64_memview(int64_t[:] values, + int64_t[:] indexer, + int64_t[:] out, + fill_value=np.nan): + + + + cdef: + Py_ssize_t i, n, idx + int64_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_1d_int64_int64(ndarray[int64_t, ndim=1] values, + int64_t[:] indexer, + int64_t[:] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_1d_int64_int64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + + cdef: + Py_ssize_t i, n, idx + int64_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_int64_int64_memview(int64_t[:, :] values, + int64_t[:] indexer, + int64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF True: + cdef: + int64_t *v + int64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int64_t) and + sizeof(int64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int64_int64(ndarray[int64_t, ndim=2] values, + ndarray[int64_t] indexer, + int64_t[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_int64_int64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF True: + cdef: + int64_t *v + int64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(int64_t) and + sizeof(int64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(int64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_int64_int64_memview(int64_t[:, :] values, + int64_t[:] indexer, + int64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int64_int64(ndarray[int64_t, ndim=2] values, + ndarray[int64_t] indexer, + int64_t[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_int64_int64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + cdef: + Py_ssize_t i, j, k, n, idx + int64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int64_int64(ndarray[int64_t, ndim=2] values, + indexer, + ndarray[int64_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + int64_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_1d_int64_float64_memview(int64_t[:] values, + int64_t[:] indexer, + float64_t[:] out, + fill_value=np.nan): + + + + cdef: + Py_ssize_t i, n, idx + float64_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_1d_int64_float64(ndarray[int64_t, ndim=1] values, + int64_t[:] indexer, + float64_t[:] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_1d_int64_float64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ + cdef: + Py_ssize_t i, n, idx + float64_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_int64_float64_memview(int64_t[:, :] values, + int64_t[:] indexer, + float64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + float64_t *v + float64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(float64_t) and + sizeof(float64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(float64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int64_float64(ndarray[int64_t, ndim=2] values, + ndarray[int64_t] indexer, + float64_t[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_int64_float64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + float64_t *v + float64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(float64_t) and + sizeof(float64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(float64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_int64_float64_memview(int64_t[:, :] values, + int64_t[:] indexer, + float64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int64_float64(ndarray[int64_t, ndim=2] values, + ndarray[int64_t] indexer, + float64_t[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_int64_float64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int64_float64(ndarray[int64_t, ndim=2] values, + indexer, + ndarray[float64_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + float64_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_1d_float32_float32_memview(float32_t[:] values, + int64_t[:] indexer, + float32_t[:] out, + fill_value=np.nan): + + + + cdef: + Py_ssize_t i, n, idx + float32_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_1d_float32_float32(ndarray[float32_t, ndim=1] values, + int64_t[:] indexer, + float32_t[:] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_1d_float32_float32_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + + cdef: + Py_ssize_t i, n, idx + float32_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_float32_float32_memview(float32_t[:, :] values, + int64_t[:] indexer, + float32_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float32_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF True: + cdef: + float32_t *v + float32_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(float32_t) and + sizeof(float32_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(float32_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_float32_float32(ndarray[float32_t, ndim=2] values, + ndarray[int64_t] indexer, + float32_t[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_float32_float32_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
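[Editor's note, not part of the patch] The take_2d_multi_* kernels differ from the per-axis ones in that they receive a pair of indexers, one per axis, and a -1 in either indexer marks the corresponding output cell (or whole row) as missing. A compact Python rendering of that logic (the function name is made up):

import numpy as np

def take_2d_multi_sketch(values, indexer, out, fill_value=np.nan):
    idx0, idx1 = indexer                    # row indexer, column indexer
    for i, ri in enumerate(idx0):
        for j, ci in enumerate(idx1):
            # A missing row or a missing column both produce the fill value.
            if ri == -1 or ci == -1:
                out[i, j] = fill_value
            else:
                out[i, j] = values[ri, ci]

values = np.arange(6, dtype=np.float64).reshape(2, 3)
out = np.empty((2, 2))
take_2d_multi_sketch(values, (np.array([1, -1]), np.array([0, 2])), out)
print(out)                                  # [[3. 5.] [nan nan]]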
+ cdef: + Py_ssize_t i, j, k, n, idx + float32_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF True: + cdef: + float32_t *v + float32_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(float32_t) and + sizeof(float32_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(float32_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_float32_float32_memview(float32_t[:, :] values, + int64_t[:] indexer, + float32_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float32_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_float32_float32(ndarray[float32_t, ndim=2] values, + ndarray[int64_t] indexer, + float32_t[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_float32_float32_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + cdef: + Py_ssize_t i, j, k, n, idx + float32_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_float32_float32(ndarray[float32_t, ndim=2] values, + indexer, + ndarray[float32_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + float32_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_1d_float32_float64_memview(float32_t[:] values, + int64_t[:] indexer, + float64_t[:] out, + fill_value=np.nan): + + + + cdef: + Py_ssize_t i, n, idx + float64_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_1d_float32_float64(ndarray[float32_t, ndim=1] values, + int64_t[:] indexer, + float64_t[:] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_1d_float32_float64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. 
Instead we can use + # the slightly slower Cython ndarray type directly. + + cdef: + Py_ssize_t i, n, idx + float64_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_float32_float64_memview(float32_t[:, :] values, + int64_t[:] indexer, + float64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + float64_t *v + float64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(float64_t) and + sizeof(float64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(float64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_float32_float64(ndarray[float32_t, ndim=2] values, + ndarray[int64_t] indexer, + float64_t[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_float32_float64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + float64_t *v + float64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(float64_t) and + sizeof(float64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(float64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_float32_float64_memview(float32_t[:, :] values, + int64_t[:] indexer, + float64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_float32_float64(ndarray[float32_t, ndim=2] values, + ndarray[int64_t] indexer, + float64_t[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_float32_float64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_float32_float64(ndarray[float32_t, ndim=2] values, + indexer, + ndarray[float64_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + float64_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_1d_float64_float64_memview(float64_t[:] values, + int64_t[:] indexer, + float64_t[:] out, + fill_value=np.nan): + + + + cdef: + Py_ssize_t i, n, idx + float64_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_1d_float64_float64(ndarray[float64_t, ndim=1] values, + int64_t[:] indexer, + float64_t[:] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_1d_float64_float64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + + cdef: + Py_ssize_t i, n, idx + float64_t fv + + n = indexer.shape[0] + + fv = fill_value + + with nogil: + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_float64_float64_memview(float64_t[:, :] values, + int64_t[:] indexer, + float64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF True: + cdef: + float64_t *v + float64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(float64_t) and + sizeof(float64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(float64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_float64_float64(ndarray[float64_t, ndim=2] values, + ndarray[int64_t] indexer, + float64_t[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_float64_float64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF True: + cdef: + float64_t *v + float64_t *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(float64_t) and + sizeof(float64_t) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(float64_t) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_float64_float64_memview(float64_t[:, :] values, + int64_t[:] indexer, + float64_t[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_float64_float64(ndarray[float64_t, ndim=2] values, + ndarray[int64_t] indexer, + float64_t[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_float64_float64_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + cdef: + Py_ssize_t i, j, k, n, idx + float64_t fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_float64_float64(ndarray[float64_t, ndim=2] values, + indexer, + ndarray[float64_t, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + float64_t fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_1d_object_object_memview(object[:] values, + int64_t[:] indexer, + object[:] out, + fill_value=np.nan): + + + + cdef: + Py_ssize_t i, n, idx + object fv + + n = indexer.shape[0] + + fv = fill_value + + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_1d_object_object(ndarray[object, ndim=1] values, + int64_t[:] indexer, + object[:] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_1d_object_object_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
+ + cdef: + Py_ssize_t i, n, idx + object fv + + n = indexer.shape[0] + + fv = fill_value + + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + out[i] = fv + else: + out[i] = values[idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_object_object_memview(object[:, :] values, + int64_t[:] indexer, + object[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + object fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + object *v + object *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(object) and + sizeof(object) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(object) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_object_object(ndarray[object, ndim=2] values, + ndarray[int64_t] indexer, + object[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_object_object_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. + cdef: + Py_ssize_t i, j, k, n, idx + object fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF False: + cdef: + object *v + object *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(object) and + sizeof(object) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(object) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = values[idx, j] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_object_object_memview(object[:, :] values, + int64_t[:] indexer, + object[:, :] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + object fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_object_object(ndarray[object, ndim=2] values, + ndarray[int64_t] indexer, + object[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_object_object_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
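[Editor's note, not part of the patch] Note that the object-dtype kernels above never wrap their loops in with nogil:, unlike the numeric ones, because storing into an object[:] slot assigns Python references and so must hold the GIL. In the template below this is driven by the last field of each dtypes entry, which decides whether the loop body gets a with nogil: prefix and an extra level of indentation. A tiny sketch of that switch (hypothetical helper, not the template's actual code):

nogil_flag = {  # (name, dest) -> may the generated loop release the GIL?
    ("float64", "float64"): True,
    ("object", "object"): False,   # object assignment touches refcounts
}

def loop_prefix(name, dest, indent=" " * 4):
    # Mirrors the template's nogil_str/tab handling: emit "with nogil:" and
    # indent the loop one extra level only when the dtype pair allows it.
    if nogil_flag[(name, dest)]:
        return "with nogil:\n" + indent
    return ""

print(loop_prefix("float64", "float64") + "for i in range(n): ...")
print(loop_prefix("object", "object") + "for i in range(n): ...")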
+ cdef: + Py_ssize_t i, j, k, n, idx + object fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = values[i, idx] + + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_object_object(ndarray[object, ndim=2] values, + indexer, + ndarray[object, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + object fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = values[idx, idx1[j]] diff --git a/pandas/src/algos_take_helper.pxi.in b/pandas/src/algos_take_helper.pxi.in new file mode 100644 index 0000000000000..e9abbcd13f499 --- /dev/null +++ b/pandas/src/algos_take_helper.pxi.in @@ -0,0 +1,261 @@ +""" +Template for each `dtype` helper function for take + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +#---------------------------------------------------------------------- +# take_1d, take_2d +#---------------------------------------------------------------------- + +{{py: + +# name, dest, c_type_in, c_type_out, preval, postval, can_copy, nogil +dtypes = [ + ('bool', 'bool', 'uint8_t', 'uint8_t', '', '', True, True), + ('bool', 'object', 'uint8_t', 'object', + 'True if ', ' > 0 else False', False, False), + ('int8', 'int8', 'int8_t', 'int8_t', '', '', True, False), + ('int8', 'int32', 'int8_t', 'int32_t', '', '', False, True), + ('int8', 'int64', 'int8_t', 'int64_t', '', '', False, True), + ('int8', 'float64', 'int8_t', 'float64_t', '', '', False, True), + ('int16', 'int16', 'int16_t', 'int16_t', '', '', True, True), + ('int16', 'int32', 'int16_t', 'int32_t', '', '', False, True), + ('int16', 'int64', 'int16_t', 'int64_t', '', '', False, True), + ('int16', 'float64', 'int16_t', 'float64_t', '', '', False, True), + ('int32', 'int32', 'int32_t', 'int32_t', '', '', True, True), + ('int32', 'int64', 'int32_t', 'int64_t', '', '', False, True), + ('int32', 'float64', 'int32_t', 'float64_t', '', '', False, True), + ('int64', 'int64', 'int64_t', 'int64_t', '', '', True, True), + ('int64', 'float64', 'int64_t', 'float64_t', '', '', False, True), + ('float32', 'float32', 'float32_t', 'float32_t', '', '', True, True), + ('float32', 'float64', 'float32_t', 'float64_t', '', '', False, True), + ('float64', 'float64', 'float64_t', 'float64_t', '', '', True, True), + ('object', 'object', 'object', 'object', '', '', False, False)] + + +def get_dispatch(dtypes): + + inner_take_1d_template = """ + cdef: + Py_ssize_t i, n, idx + %(c_type_out)s fv + + n = indexer.shape[0] + + fv = fill_value + + %(nogil_str)s + %(tab)sfor i from 0 <= i < n: + %(tab)s idx = indexer[i] + %(tab)s if idx == -1: + %(tab)s out[i] = fv + %(tab)s else: + %(tab)s out[i] = %(preval)svalues[idx]%(postval)s +""" + + inner_take_2d_axis0_template = """\ + cdef: + Py_ssize_t i, j, k, n, idx + %(c_type_out)s fv + + n = len(indexer) + k = values.shape[1] + + fv = fill_value + + IF %(can_copy)s: + cdef: + %(c_type_out)s *v + %(c_type_out)s *o + + #GH3130 + if (values.strides[1] == out.strides[1] and + values.strides[1] == sizeof(%(c_type_out)s) and + sizeof(%(c_type_out)s) * n >= 256): + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: 
+ for j from 0 <= j < k: + out[i, j] = fv + else: + v = &values[idx, 0] + o = &out[i, 0] + memmove(o, v, (sizeof(%(c_type_out)s) * k)) + return + + for i from 0 <= i < n: + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + out[i, j] = %(preval)svalues[idx, j]%(postval)s +""" + + inner_take_2d_axis1_template = """\ + cdef: + Py_ssize_t i, j, k, n, idx + %(c_type_out)s fv + + n = len(values) + k = len(indexer) + + if n == 0 or k == 0: + return + + fv = fill_value + + for i from 0 <= i < n: + for j from 0 <= j < k: + idx = indexer[j] + if idx == -1: + out[i, j] = fv + else: + out[i, j] = %(preval)svalues[i, idx]%(postval)s +""" + + for (name, dest, c_type_in, c_type_out, preval, postval, + can_copy, nogil) in dtypes: + if nogil: + nogil_str = "with nogil:" + tab = ' ' + else: + nogil_str = '' + tab = '' + + args = dict(name=name, dest=dest, c_type_in=c_type_in, + c_type_out=c_type_out, preval=preval, postval=postval, + can_copy=can_copy, nogil_str=nogil_str, tab=tab) + + inner_take_1d = inner_take_1d_template % args + inner_take_2d_axis0 = inner_take_2d_axis0_template % args + inner_take_2d_axis1 = inner_take_2d_axis1_template % args + + yield (name, dest, c_type_in, c_type_out, preval, postval, can_copy, + inner_take_1d, inner_take_2d_axis0, inner_take_2d_axis1) + +}} + + +{{for name, dest, c_type_in, c_type_out, preval, postval, can_copy, + inner_take_1d, inner_take_2d_axis0, inner_take_2d_axis1 + in get_dispatch(dtypes)}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_1d_{{name}}_{{dest}}_memview({{c_type_in}}[:] values, + int64_t[:] indexer, + {{c_type_out}}[:] out, + fill_value=np.nan): + + +{{inner_take_1d}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_1d_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=1] values, + int64_t[:] indexer, + {{c_type_out}}[:] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_1d_{{name}}_{{dest}}_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. +{{inner_take_1d}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis0_{{name}}_{{dest}}_memview({{c_type_in}}[:, :] values, + int64_t[:] indexer, + {{c_type_out}}[:, :] out, + fill_value=np.nan): +{{inner_take_2d_axis0}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, + ndarray[int64_t] indexer, + {{c_type_out}}[:, :] out, + fill_value=np.nan): + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis0_{{name}}_{{dest}}_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. 
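[Editor's note, not part of the patch] The new algos_take_helper.pxi.in collapses all of the repetition above into one table: the {{py: ...}} block specialises the three inner-loop templates for every (name, dest) pair via %-substitution (it also threads the nogil/indentation handling through the same step), and the {{for ...}} loop then stamps out the memoryview kernel, the ndarray fallback and the take_2d_multi variant for each pair, so the generated .pxi is not edited by hand. A stripped-down Python sketch of that substitution step (hypothetical, much shorter than the real template and not the actual Tempita machinery):

# One inner-loop body, parametrised the same way as the template above.
inner_take_1d = """\
    cdef:
        Py_ssize_t i, n, idx
        %(c_type_out)s fv

    n = indexer.shape[0]
    fv = fill_value
    for i in range(n):
        idx = indexer[i]
        out[i] = fv if idx == -1 else %(preval)svalues[idx]%(postval)s
"""

# (name, dest, c_type_out, preval, postval): a two-row excerpt of the table
dtypes = [
    ("int16", "float64", "float64_t", "", ""),
    ("bool", "object", "object", "True if ", " > 0 else False"),
]

for name, dest, c_out, pre, post in dtypes:
    body = inner_take_1d % dict(c_type_out=c_out, preval=pre, postval=post)
    print("def take_1d_%s_%s(values, indexer, out, fill_value=np.nan):" % (name, dest))
    print(body)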
+{{inner_take_2d_axis0}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline take_2d_axis1_{{name}}_{{dest}}_memview({{c_type_in}}[:, :] values, + int64_t[:] indexer, + {{c_type_out}}[:, :] out, + fill_value=np.nan): +{{inner_take_2d_axis1}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, + ndarray[int64_t] indexer, + {{c_type_out}}[:, :] out, + fill_value=np.nan): + + if values.flags.writeable: + # We can call the memoryview version of the code + take_2d_axis1_{{name}}_{{dest}}_memview(values, indexer, out, + fill_value=fill_value) + return + + # We cannot use the memoryview version on readonly-buffers due to + # a limitation of Cython's typed memoryviews. Instead we can use + # the slightly slower Cython ndarray type directly. +{{inner_take_2d_axis1}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, + indexer, + ndarray[{{c_type_out}}, ndim=2] out, + fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t] idx0 = indexer[0] + ndarray[int64_t] idx1 = indexer[1] + {{c_type_out}} fv + + n = len(idx0) + k = len(idx1) + + fv = fill_value + for i from 0 <= i < n: + idx = idx0[i] + if idx == -1: + for j from 0 <= j < k: + out[i, j] = fv + else: + for j from 0 <= j < k: + if idx1[j] == -1: + out[i, j] = fv + else: + out[i, j] = {{preval}}values[idx, idx1[j]]{{postval}} + +{{endfor}} \ No newline at end of file diff --git a/pandas/src/datetime/np_datetime.c b/pandas/src/datetime/np_datetime.c index c30b404d2b8b2..80703c8b08de6 100644 --- a/pandas/src/datetime/np_datetime.c +++ b/pandas/src/datetime/np_datetime.c @@ -576,7 +576,7 @@ void pandas_datetime_to_datetimestruct(npy_datetime val, PANDAS_DATETIMEUNIT fr, } PANDAS_DATETIMEUNIT get_datetime64_unit(PyObject *obj) { - return ((PyDatetimeScalarObject *) obj)->obmeta.base; + return (PANDAS_DATETIMEUNIT)((PyDatetimeScalarObject *) obj)->obmeta.base; } diff --git a/pandas/src/datetime/np_datetime_strings.c b/pandas/src/datetime/np_datetime_strings.c index 3a1d37f86cc28..b633d6cde0820 100644 --- a/pandas/src/datetime/np_datetime_strings.c +++ b/pandas/src/datetime/np_datetime_strings.c @@ -460,7 +460,7 @@ parse_iso_8601_datetime(char *str, int len, } /* Check the casting rule */ - if (unit != -1 && !can_cast_datetime64_units(bestunit, unit, + if (!can_cast_datetime64_units(bestunit, unit, casting)) { PyErr_Format(PyExc_TypeError, "Cannot parse \"%s\" as unit " "'%s' using casting rule %s", @@ -503,7 +503,7 @@ parse_iso_8601_datetime(char *str, int len, } /* Check the casting rule */ - if (unit != -1 && !can_cast_datetime64_units(bestunit, unit, + if (!can_cast_datetime64_units(bestunit, unit, casting)) { PyErr_Format(PyExc_TypeError, "Cannot parse \"%s\" as unit " "'%s' using casting rule %s", @@ -975,7 +975,7 @@ parse_iso_8601_datetime(char *str, int len, } /* Check the casting rule */ - if (unit != -1 && !can_cast_datetime64_units(bestunit, unit, + if (!can_cast_datetime64_units(bestunit, unit, casting)) { PyErr_Format(PyExc_TypeError, "Cannot parse \"%s\" as unit " "'%s' using casting rule %s", @@ -1005,11 +1005,6 @@ get_datetime_iso_8601_strlen(int local, PANDAS_DATETIMEUNIT base) { int len = 0; - /* If no unit is provided, return the maximum length */ - if (base == -1) { - return PANDAS_DATETIME_MAX_ISO8601_STRLEN; - } - switch (base) { /* Generic units can only be used to represent NaT */ /*case PANDAS_FR_GENERIC:*/ @@ -1146,28 
+1141,13 @@ make_iso_8601_datetime(pandas_datetimestruct *dts, char *outstr, int outlen, local = 0; } - /* Automatically detect a good unit */ - if (base == -1) { - base = lossless_unit_from_datetimestruct(dts); - /* - * If there's a timezone, use at least minutes precision, - * and never split up hours and minutes by default - */ - if ((base < PANDAS_FR_m && local) || base == PANDAS_FR_h) { - base = PANDAS_FR_m; - } - /* Don't split up dates by default */ - else if (base < PANDAS_FR_D) { - base = PANDAS_FR_D; - } - } /* * Print weeks with the same precision as days. * * TODO: Could print weeks with YYYY-Www format if the week * epoch is a Monday. */ - else if (base == PANDAS_FR_W) { + if (base == PANDAS_FR_W) { base = PANDAS_FR_D; } diff --git a/pandas/src/generate_code.py b/pandas/src/generate_code.py deleted file mode 100644 index 309a81b38f4e1..0000000000000 --- a/pandas/src/generate_code.py +++ /dev/null @@ -1,2182 +0,0 @@ -""" -This file generates `generated.pyx` which is then included in `../algos.pyx` -during building. To regenerate `generated.pyx`, just run: - - `python generate_code.py`. - -""" - -# flake8: noqa - -from __future__ import print_function -import os -from pandas.compat import StringIO -import numpy as np - -_int64_max = np.iinfo(np.int64).max - -warning_to_new_contributors = """ -# DO NOT EDIT THIS FILE: This file was autogenerated from generate_code.py, so -# please edit that file and then run `python2 generate_code.py` to re-generate -# this file. -""" - -header = """ -cimport numpy as np -cimport cython - -from libc.string cimport memmove - -from numpy cimport * - -from cpython cimport (PyDict_New, PyDict_GetItem, PyDict_SetItem, - PyDict_Contains, PyDict_Keys, - Py_INCREF, PyTuple_SET_ITEM, - PyTuple_SetItem, - PyTuple_New) -from cpython cimport PyFloat_Check -cimport cpython - -cdef extern from "numpy/npy_math.h": - double NAN "NPY_NAN" - -import numpy as np -isnan = np.isnan - -from datetime import datetime as pydatetime - -# this is our datetime.pxd -from datetime cimport * - -from khash cimport * - -ctypedef unsigned char UChar - -cimport util -from util cimport is_array, _checknull, _checknan, get_nat -cimport lib -from lib cimport is_null_datetimelike - -cdef int64_t iNaT = get_nat() - -# import datetime C API -PyDateTime_IMPORT - -# initialize numpy -import_array() -import_ufunc() - -cdef int PLATFORM_INT = ( np.arange(0, dtype=np.int_)).descr.type_num - -cpdef ensure_platform_int(object arr): - if util.is_array(arr): - if ( arr).descr.type_num == PLATFORM_INT: - return arr - else: - return arr.astype(np.int_) - else: - return np.array(arr, dtype=np.int_) - -cpdef ensure_object(object arr): - if util.is_array(arr): - if ( arr).descr.type_num == NPY_OBJECT: - return arr - else: - return arr.astype(np.object_) - elif hasattr(arr,'asobject'): - return arr.asobject - else: - return np.array(arr, dtype=np.object_) -""" - - -inner_take_1d_template = """\ - cdef: - Py_ssize_t i, n, idx - %(c_type_out)s fv - - n = indexer.shape[0] - - fv = fill_value - - %(nogil)s - %(tab)sfor i from 0 <= i < n: - %(tab)s idx = indexer[i] - %(tab)s if idx == -1: - %(tab)s out[i] = fv - %(tab)s else: - %(tab)s out[i] = %(preval)svalues[idx]%(postval)s -""" - -take_1d_template = """\ -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_%(name)s_%(dest)s_memview(%(c_type_in)s[:] values, - int64_t[:] indexer, - %(c_type_out)s[:] out, - fill_value=np.nan): -""" + inner_take_1d_template + """ - -@cython.wraparound(False) -@cython.boundscheck(False) -def 
take_1d_%(name)s_%(dest)s(ndarray[%(c_type_in)s, ndim=1] values, - int64_t[:] indexer, - %(c_type_out)s[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_%(name)s_%(dest)s_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. -""" + inner_take_1d_template - -inner_take_2d_axis0_template = """\ - cdef: - Py_ssize_t i, j, k, n, idx - %(c_type_out)s fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF %(can_copy)s: - cdef: - %(c_type_out)s *v - %(c_type_out)s *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(%(c_type_out)s) and - sizeof(%(c_type_out)s) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(%(c_type_out)s) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = %(preval)svalues[idx, j]%(postval)s -""" - -take_2d_axis0_template = """\ -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_%(name)s_%(dest)s_memview(%(c_type_in)s[:, :] values, - int64_t[:] indexer, - %(c_type_out)s[:, :] out, - fill_value=np.nan): -""" + inner_take_2d_axis0_template + """ - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_%(name)s_%(dest)s(ndarray[%(c_type_in)s, ndim=2] values, - ndarray[int64_t] indexer, - %(c_type_out)s[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_%(name)s_%(dest)s_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. -""" + inner_take_2d_axis0_template - - -inner_take_2d_axis1_template = """\ - cdef: - Py_ssize_t i, j, k, n, idx - %(c_type_out)s fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = %(preval)svalues[i, idx]%(postval)s -""" - -take_2d_axis1_template = """\ -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_%(name)s_%(dest)s_memview(%(c_type_in)s[:, :] values, - int64_t[:] indexer, - %(c_type_out)s[:, :] out, - fill_value=np.nan): -""" + inner_take_2d_axis1_template + """ - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_%(name)s_%(dest)s(ndarray[%(c_type_in)s, ndim=2] values, - ndarray[int64_t] indexer, - %(c_type_out)s[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_%(name)s_%(dest)s_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
-""" + inner_take_2d_axis1_template - - -take_2d_multi_template = """@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_%(name)s_%(dest)s(ndarray[%(c_type_in)s, ndim=2] values, - indexer, - ndarray[%(c_type_out)s, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - %(c_type_out)s fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = %(preval)svalues[idx, idx1[j]]%(postval)s -""" - - - -""" -Backfilling logic for generating fill vector - -Diagram of what's going on - -Old New Fill vector Mask - . 0 1 - . 0 1 - . 0 1 -A A 0 1 - . 1 1 - . 1 1 - . 1 1 - . 1 1 - . 1 1 -B B 1 1 - . 2 1 - . 2 1 - . 2 1 -C C 2 1 - . 0 - . 0 -D -""" - -backfill_template = """@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_%(name)s(ndarray[%(c_type)s] old, ndarray[%(c_type)s] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef %(c_type)s cur, prev - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: - return indexer - - i = nleft - 1 - j = nright - 1 - - cur = old[nleft - 1] - - while j >= 0 and new[j] > cur: - j -= 1 - - while True: - if j < 0: - break - - if i == 0: - while j >= 0: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - break - - prev = old[i - 1] - - while j >= 0 and prev < new[j] <= cur: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - - fill_count = 0 - i -= 1 - cur = prev - - return indexer -""" - - -pad_template = """@cython.boundscheck(False) -@cython.wraparound(False) -def pad_%(name)s(ndarray[%(c_type)s] old, ndarray[%(c_type)s] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef %(c_type)s cur, next - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: - return indexer - - i = j = 0 - - cur = old[0] - - while j <= nright - 1 and new[j] < cur: - j += 1 - - while True: - if j == nright: - break - - if i == nleft - 1: - while j < nright: - if new[j] == cur: - indexer[j] = i - elif new[j] > cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - break - - next = old[i + 1] - - while j < nright and cur <= new[j] < next: - if new[j] == cur: - indexer[j] = i - elif fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - - fill_count = 0 - i += 1 - cur = next - - return indexer -""" - -pad_1d_template = """@cython.boundscheck(False) -@cython.wraparound(False) -def pad_inplace_%(name)s(ndarray[%(c_type)s] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef %(c_type)s val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if 
N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[0] - for i in range(N): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] -""" - -pad_2d_template = """@cython.boundscheck(False) -@cython.wraparound(False) -def pad_2d_inplace_%(name)s(ndarray[%(c_type)s, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef %(c_type)s val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, 0] - for i in range(N): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] -""" - -backfill_2d_template = """@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_2d_inplace_%(name)s(ndarray[%(c_type)s, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef %(c_type)s val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, N - 1] - for i in range(N - 1, -1 , -1): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] -""" - -backfill_1d_template = """@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_inplace_%(name)s(ndarray[%(c_type)s] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef %(c_type)s val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[N - 1] - for i in range(N - 1, -1 , -1): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] -""" - - -diff_2d_template = """@cython.boundscheck(False) -@cython.wraparound(False) -def diff_2d_%(name)s(ndarray[%(c_type)s, ndim=2] arr, - ndarray[%(dest_type2)s, ndim=2] out, - Py_ssize_t periods, int axis): - cdef: - Py_ssize_t i, j, sx, sy - - sx, sy = ( arr).shape - if arr.flags.f_contiguous: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for j in range(sy): - for i in range(start, stop): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for j in range(start, stop): - for i in range(sx): - out[i, j] = arr[i, j] - arr[i, j - periods] - else: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for i in range(start, stop): - for j in range(sy): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for i in range(sx): - for j in range(start, stop): - out[i, j] = arr[i, j] - arr[i, j - periods] -""" - -is_monotonic_template = '''@cython.boundscheck(False) -@cython.wraparound(False) -def 
is_monotonic_%(name)s(ndarray[%(c_type)s] arr, bint timelike): - """ - Returns - ------- - is_monotonic_inc, is_monotonic_dec - """ - cdef: - Py_ssize_t i, n - %(c_type)s prev, cur - bint is_monotonic_inc = 1 - bint is_monotonic_dec = 1 - - n = len(arr) - - if n == 1: - if arr[0] != arr[0] or (timelike and arr[0] == iNaT): - # single value is NaN - return False, False - else: - return True, True - elif n < 2: - return True, True - - if timelike and arr[0] == iNaT: - return False, False - - %(nogil)s - %(tab)sprev = arr[0] - %(tab)sfor i in range(1, n): - %(tab)s cur = arr[i] - %(tab)s if timelike and cur == iNaT: - %(tab)s is_monotonic_inc = 0 - %(tab)s is_monotonic_dec = 0 - %(tab)s break - %(tab)s if cur < prev: - %(tab)s is_monotonic_inc = 0 - %(tab)s elif cur > prev: - %(tab)s is_monotonic_dec = 0 - %(tab)s elif cur == prev: - %(tab)s pass # is_unique = 0 - %(tab)s else: - %(tab)s # cur or prev is NaN - %(tab)s is_monotonic_inc = 0 - %(tab)s is_monotonic_dec = 0 - %(tab)s break - %(tab)s if not is_monotonic_inc and not is_monotonic_dec: - %(tab)s is_monotonic_inc = 0 - %(tab)s is_monotonic_dec = 0 - %(tab)s break - %(tab)s prev = cur - return is_monotonic_inc, is_monotonic_dec -''' - -map_indices_template = '''@cython.wraparound(False) -@cython.boundscheck(False) -cpdef map_indices_%(name)s(ndarray[%(c_type)s] index): - """ - Produce a dict mapping the values of the input array to their respective - locations. - - Example: - array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} - - Better to do this with Cython because of the enormous speed boost. - """ - cdef Py_ssize_t i, length - cdef dict result = {} - - length = len(index) - - for i in range(length): - result[index[i]] = i - - return result -''' - -groupby_template = '''@cython.wraparound(False) -@cython.boundscheck(False) -def groupby_%(name)s(ndarray[%(c_type)s] index, ndarray labels): - cdef dict result = {} - cdef Py_ssize_t i, length - cdef list members - cdef object idx, key - - length = len(index) - - if not length == len(labels): - raise AssertionError("len(index) != len(labels)") - - for i in range(length): - key = util.get_value_1d(labels, i) - - if is_null_datetimelike(key): - continue - - idx = index[i] - if key in result: - members = result[key] - members.append(idx) - else: - result[key] = [idx] - - return result -''' - -group_last_template = '''@cython.wraparound(False) -@cython.boundscheck(False) -def group_last_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, - ndarray[int64_t] counts, - ndarray[%(c_type)s, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - %(dest_type2)s val, count - ndarray[%(dest_type2)s, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros(( out).shape, dtype=np.int64) - resx = np.empty_like(out) - - N, K = ( values).shape - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != %(nan_val)s: - nobs[lab, j] += 1 - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = %(nan_val)s - else: - out[i, j] = resx[i, j] -''' - -group_nth_template = '''@cython.wraparound(False) -@cython.boundscheck(False) -def group_nth_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, - ndarray[int64_t] counts, - ndarray[%(c_type)s, ndim=2] values, - 
ndarray[int64_t] labels, int64_t rank): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - %(dest_type2)s val, count - ndarray[%(dest_type2)s, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros(( out).shape, dtype=np.int64) - resx = np.empty_like(out) - - N, K = ( values).shape - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != %(nan_val)s: - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = %(nan_val)s - else: - out[i, j] = resx[i, j] -''' - -group_add_template = '''@cython.wraparound(False) -@cython.boundscheck(False) -def group_add_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, - ndarray[int64_t] counts, - ndarray[%(c_type)s, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - %(dest_type2)s val, count - ndarray[%(dest_type2)s, ndim=2] sumx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - sumx = np.zeros_like(out) - - N, K = ( values).shape - - - with nogil: - - if K > 1: - - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - sumx[lab, j] += val - - else: - - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val: - nobs[lab, 0] += 1 - sumx[lab, 0] += val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = sumx[i, j] -''' - -group_prod_template = '''@cython.wraparound(False) -@cython.boundscheck(False) -def group_prod_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, - ndarray[int64_t] counts, - ndarray[%(c_type)s, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - %(dest_type2)s val, count - ndarray[%(dest_type2)s, ndim=2] prodx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - prodx = np.ones_like(out) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - prodx[lab, j] *= val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val: - nobs[lab, 0] += 1 - prodx[lab, 0] *= val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = prodx[i, j] -''' - -group_var_template = '''@cython.wraparound(False) -@cython.boundscheck(False) -@cython.cdivision(True) -def group_var_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, - ndarray[int64_t] counts, - ndarray[%(dest_type2)s, ndim=2] values, - ndarray[int64_t] labels): - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - %(dest_type2)s val, ct, oldmean - ndarray[%(dest_type2)s, ndim=2] nobs, mean - - if not len(values) == len(labels): - 
raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - mean = np.zeros_like(out) - - N, K = ( values).shape - - out[:, :] = 0.0 - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - oldmean = mean[lab, j] - mean[lab, j] += (val - oldmean) / nobs[lab, j] - out[lab, j] += (val - mean[lab, j]) * (val - oldmean) - - for i in range(ncounts): - for j in range(K): - ct = nobs[i, j] - if ct < 2: - out[i, j] = NAN - else: - out[i, j] /= (ct - 1) - -''' - -# add passing bin edges, instead of labels - - -#---------------------------------------------------------------------- -# group_min, group_max - -group_max_template = '''@cython.wraparound(False) -@cython.boundscheck(False) -def group_max_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, - ndarray[int64_t] counts, - ndarray[%(dest_type2)s, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - %(dest_type2)s val, count - ndarray[%(dest_type2)s, ndim=2] maxx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - - maxx = np.empty_like(out) - maxx.fill(-%(inf_val)s) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != %(nan_val)s: - nobs[lab, j] += 1 - if val > maxx[lab, j]: - maxx[lab, j] = val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val and val != %(nan_val)s: - nobs[lab, 0] += 1 - if val > maxx[lab, 0]: - maxx[lab, 0] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = %(nan_val)s - else: - out[i, j] = maxx[i, j] -''' - -group_min_template = '''@cython.wraparound(False) -@cython.boundscheck(False) -def group_min_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, - ndarray[int64_t] counts, - ndarray[%(dest_type2)s, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - %(dest_type2)s val, count - ndarray[%(dest_type2)s, ndim=2] minx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - - minx = np.empty_like(out) - minx.fill(%(inf_val)s) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != %(nan_val)s: - - nobs[lab, j] += 1 - if val < minx[lab, j]: - minx[lab, j] = val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val and val != %(nan_val)s: - nobs[lab, 0] += 1 - if val < minx[lab, 0]: - minx[lab, 0] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = %(nan_val)s - else: - out[i, j] = minx[i, j] -''' - - -group_mean_template = '''@cython.wraparound(False) -@cython.boundscheck(False) -def group_mean_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, - ndarray[int64_t] counts, - ndarray[%(dest_type2)s, ndim=2] values, - ndarray[int64_t] labels): - cdef: - Py_ssize_t 
i, j, N, K, lab, ncounts = len(counts) - %(dest_type2)s val, count - ndarray[%(dest_type2)s, ndim=2] sumx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - sumx = np.zeros_like(out) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - # not nan - if val == val: - nobs[lab, j] += 1 - sumx[lab, j] += val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - # not nan - if val == val: - nobs[lab, 0] += 1 - sumx[lab, 0] += val - - for i in range(ncounts): - for j in range(K): - count = nobs[i, j] - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = sumx[i, j] / count -''' - -group_ohlc_template = '''@cython.wraparound(False) -@cython.boundscheck(False) -def group_ohlc_%(name)s(ndarray[%(dest_type2)s, ndim=2] out, - ndarray[int64_t] counts, - ndarray[%(dest_type2)s, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab - %(dest_type2)s val, count - Py_ssize_t ngroups = len(counts) - - if len(labels) == 0: - return - - N, K = ( values).shape - - if out.shape[1] != 4: - raise ValueError('Output array must have 4 columns') - - if K > 1: - raise NotImplementedError("Argument 'values' must have only " - "one dimension") - out.fill(np.nan) - - with nogil: - for i in range(N): - lab = labels[i] - if lab == -1: - continue - - counts[lab] += 1 - val = values[i, 0] - if val != val: - continue - - if out[lab, 0] != out[lab, 0]: - out[lab, 0] = out[lab, 1] = out[lab, 2] = out[lab, 3] = val - else: - out[lab, 1] = max(out[lab, 1], val) - out[lab, 2] = min(out[lab, 2], val) - out[lab, 3] = val -''' - -arrmap_template = '''@cython.wraparound(False) -@cython.boundscheck(False) -def arrmap_%(name)s(ndarray[%(c_type)s] index, object func): - cdef Py_ssize_t length = index.shape[0] - cdef Py_ssize_t i = 0 - - cdef ndarray[object] result = np.empty(length, dtype=np.object_) - - from pandas.lib import maybe_convert_objects - - for i in range(length): - result[i] = func(index[i]) - - return maybe_convert_objects(result) -''' - -#---------------------------------------------------------------------- -# Joins on ordered, unique indices - -# right might contain non-unique values - -left_join_unique_template = '''@cython.wraparound(False) -@cython.boundscheck(False) -def left_join_indexer_unique_%(name)s(ndarray[%(c_type)s] left, - ndarray[%(c_type)s] right): - cdef: - Py_ssize_t i, j, nleft, nright - ndarray[int64_t] indexer - %(c_type)s lval, rval - - i = 0 - j = 0 - nleft = len(left) - nright = len(right) - - indexer = np.empty(nleft, dtype=np.int64) - while True: - if i == nleft: - break - - if j == nright: - indexer[i] = -1 - i += 1 - continue - - rval = right[j] - - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - - if left[i] == right[j]: - indexer[i] = j - i += 1 - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - j += 1 - elif left[i] > rval: - indexer[i] = -1 - j += 1 - else: - indexer[i] = -1 - i += 1 - return indexer -''' - -# @cython.wraparound(False) -# @cython.boundscheck(False) - -left_join_template = '''def left_join_indexer_%(name)s(ndarray[%(c_type)s] left, - ndarray[%(c_type)s] right): - """ - Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - %(c_type)s lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[%(c_type)s] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=%(dtype)s) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - i += 1 - count += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - else: - j += 1 - - return result, lindexer, rindexer -''' - - -inner_join_template = '''@cython.wraparound(False) -@cython.boundscheck(False) -def inner_join_indexer_%(name)s(ndarray[%(c_type)s] left, - ndarray[%(c_type)s] right): - """ - Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - %(c_type)s lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[%(c_type)s] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=%(dtype)s) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = rval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - return result, lindexer, rindexer -''' - - -outer_join_template2 = '''@cython.wraparound(False) -@cython.boundscheck(False) -def outer_join_indexer_%(name)s(ndarray[%(c_type)s] left, - ndarray[%(c_type)s] right): - cdef: - Py_ssize_t i, j, nright, nleft, count - %(c_type)s lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[%(c_type)s] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft == 0: - count = nright - elif nright == 0: - count = nleft - else: - while True: - if i == nleft: - count += nright - j - break - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - count += 1 - j += 1 - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=%(dtype)s) - - # do it again, but populate the indexers / result - - i = 0 - j = 0 - count = 0 - if nleft == 0: - for j in range(nright): - lindexer[j] = -1 - rindexer[j] = j - result[j] = right[j] - elif nright == 0: - for i in range(nleft): - lindexer[i] = i - rindexer[i] = -1 - result[i] = left[i] - else: - while True: - if i == nleft: - while j < nright: - lindexer[count] = -1 - rindexer[count] = j - result[count] = right[j] - count += 1 - j += 1 - break - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = lval - 
count += 1 - i += 1 - else: - lindexer[count] = -1 - rindexer[count] = j - result[count] = rval - count += 1 - j += 1 - - return result, lindexer, rindexer -''' - -outer_join_template = '''@cython.wraparound(False) -@cython.boundscheck(False) -def outer_join_indexer_%(name)s(ndarray[%(c_type)s] left, - ndarray[%(c_type)s] right): - cdef: - Py_ssize_t i, j, nright, nleft, count - %(c_type)s lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[%(c_type)s] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - while True: - if i == nleft: - if j == nright: - # we are done - break - else: - while j < nright: - j += 1 - count += 1 - break - elif j == nright: - while i < nleft: - i += 1 - count += 1 - break - else: - if left[i] == right[j]: - i += 1 - j += 1 - elif left[i] < right[j]: - i += 1 - else: - j += 1 - - count += 1 - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=%(dtype)s) - - # do it again, but populate the indexers / result - - i = 0 - j = 0 - count = 0 - while True: - if i == nleft: - if j == nright: - # we are done - break - else: - while j < nright: - lindexer[count] = -1 - rindexer[count] = j - result[count] = right[j] - j += 1 - count += 1 - break - elif j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - i += 1 - count += 1 - break - else: - lval = left[i] - rval = right[j] - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - i += 1 - j += 1 - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = lval - i += 1 - else: - lindexer[count] = -1 - rindexer[count] = j - result[count] = rval - j += 1 - - count += 1 - - return result, lindexer, rindexer -''' - -# ensure_dtype functions - -ensure_dtype_template = """ -cpdef ensure_%(name)s(object arr): - if util.is_array(arr): - if ( arr).descr.type_num == NPY_%(ctype)s: - return arr - else: - return arr.astype(np.%(dtype)s) - else: - return np.array(arr, dtype=np.%(dtype)s) -""" - -ensure_functions = [ - ('float64', 'FLOAT64', 'float64'), - ('float32', 'FLOAT32', 'float32'), - ('int8', 'INT8', 'int8'), - ('int16', 'INT16', 'int16'), - ('int32', 'INT32', 'int32'), - ('int64', 'INT64', 'int64'), - # ('platform_int', 'INT', 'int_'), - #('object', 'OBJECT', 'object_'), -] - -def generate_ensure_dtypes(): - output = StringIO() - for name, ctype, dtype in ensure_functions: - filled = ensure_dtype_template % locals() - output.write(filled) - return output.getvalue() - -#---------------------------------------------------------------------- -# Fast "put" logic for speeding up interleaving logic - -put2d_template = """ -def put2d_%(name)s_%(dest_type)s(ndarray[%(c_type)s, ndim=2, cast=True] values, - ndarray[int64_t] indexer, Py_ssize_t loc, - ndarray[%(dest_type2)s] out): - cdef: - Py_ssize_t i, j, k - - k = len(values) - for j from 0 <= j < k: - i = indexer[j] - out[i] = values[j, loc] -""" - -#---------------------------------------------------------------------- -# other grouping functions not needing a template -grouping_no_template = '''def group_median_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, ngroups, size - ndarray[int64_t] _counts - ndarray data - float64_t* ptr - ngroups = len(counts) - N, K = ( values).shape - - indexer, _counts = 
groupsort_indexer(labels, ngroups) - counts[:] = _counts[1:] - - data = np.empty((K, N), dtype=np.float64) - ptr = data.data - - take_2d_axis1_float64_float64(values.T, indexer, out=data) - - for i in range(K): - # exclude NA group - ptr += _counts[0] - for j in range(ngroups): - size = _counts[j + 1] - out[j, i] = _median_linear(ptr, size) - ptr += size - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_cumprod_float64(float64_t[:,:] out, - float64_t[:,:] values, - int64_t[:] labels, - float64_t[:,:] accum): - """ - Only transforms on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, size - float64_t val - int64_t lab - - N, K = ( values).shape - accum = np.ones_like(accum) - - with nogil: - for i in range(N): - lab = labels[i] - - if lab < 0: - continue - for j in range(K): - val = values[i, j] - if val == val: - accum[lab, j] *= val - out[i, j] = accum[lab, j] - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_cumsum(numeric[:,:] out, - numeric[:,:] values, - int64_t[:] labels, - numeric[:,:] accum): - """ - Only transforms on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, size - numeric val - int64_t lab - - N, K = ( values).shape - accum = np.zeros_like(accum) - - with nogil: - for i in range(N): - lab = labels[i] - - if lab < 0: - continue - for j in range(K): - val = values[i,j] - if val == val: - accum[lab,j] += val - out[i,j] = accum[lab,j] - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_shift_indexer(int64_t[:] out, int64_t[:] labels, - int ngroups, int periods): - cdef: - Py_ssize_t N, i, j, ii - int offset, sign - int64_t lab, idxer, idxer_slot - int64_t[:] label_seen = np.zeros(ngroups, dtype=np.int64) - int64_t[:,:] label_indexer - - N, = ( labels).shape - - if periods < 0: - periods = -periods - offset = N - 1 - sign = -1 - elif periods > 0: - offset = 0 - sign = 1 - - if periods == 0: - with nogil: - for i in range(N): - out[i] = i - else: - # array of each previous indexer seen - label_indexer = np.zeros((ngroups, periods), dtype=np.int64) - with nogil: - for i in range(N): - ## reverse iterator if shifting backwards - ii = offset + sign * i - lab = labels[ii] - label_seen[lab] += 1 - - idxer_slot = label_seen[lab] % periods - idxer = label_indexer[lab, idxer_slot] - - if label_seen[lab] > periods: - out[ii] = idxer - else: - out[ii] = -1 - - label_indexer[lab, idxer_slot] = ii -''' - - -#------------------------------------------------------------------------- -# Generators - -def generate_put_template(template, use_ints=True, use_floats=True, - use_objects=False, use_datelikes=False): - floats_list = [ - ('float64', 'float64_t', 'float64_t', 'np.float64', True), - ('float32', 'float32_t', 'float32_t', 'np.float32', True), - ] - ints_list = [ - ('int8', 'int8_t', 'float32_t', 'np.float32', True), - ('int16', 'int16_t', 'float32_t', 'np.float32', True), - ('int32', 'int32_t', 'float64_t', 'np.float64', True), - ('int64', 'int64_t', 'float64_t', 'np.float64', True), - ] - date_like_list = [ - ('int64', 'int64_t', 'float64_t', 'np.float64', True), - ] - object_list = [('object', 'object', 'object', 'np.object_', False)] - function_list = [] - if use_floats: - function_list.extend(floats_list) - if use_ints: - function_list.extend(ints_list) - if use_objects: - function_list.extend(object_list) - if use_datelikes: - function_list.extend(date_like_list) - - output = StringIO() - for name, c_type, dest_type, dest_dtype, nogil in function_list: - func = template % {'name': name, - 'c_type': c_type, - 'dest_type': 
dest_type.replace('_t', ''), - 'dest_type2': dest_type, - 'dest_dtype': dest_dtype, - 'nogil' : 'with nogil:' if nogil else '', - 'tab' : ' ' if nogil else '' } - output.write(func) - output.write("\n") - return output.getvalue() - -def generate_put_min_max_template(template, use_ints=True, use_floats=True, - use_objects=False, use_datelikes=False): - floats_list = [ - ('float64', 'float64_t', 'NAN', 'np.inf', True), - ('float32', 'float32_t', 'NAN', 'np.inf', True), - ] - ints_list = [ - ('int64', 'int64_t', 'iNaT', _int64_max, True), - ] - date_like_list = [ - ('int64', 'int64_t', 'iNaT', _int64_max, True), - ] - object_list = [('object', 'object', 'np.nan', 'np.inf', False)] - function_list = [] - if use_floats: - function_list.extend(floats_list) - if use_ints: - function_list.extend(ints_list) - if use_objects: - function_list.extend(object_list) - if use_datelikes: - function_list.extend(date_like_list) - - output = StringIO() - for name, dest_type, nan_val, inf_val, nogil in function_list: - func = template % {'name': name, - 'dest_type2': dest_type, - 'nan_val': nan_val, - 'inf_val': inf_val, - 'nogil' : "with nogil:" if nogil else '', - 'tab' : ' ' if nogil else '' } - output.write(func) - output.write("\n") - return output.getvalue() - -def generate_put_selection_template(template, use_ints=True, use_floats=True, - use_objects=False, use_datelikes=False): - floats_list = [ - ('float64', 'float64_t', 'float64_t', 'NAN', True), - ('float32', 'float32_t', 'float32_t', 'NAN', True), - ] - ints_list = [ - ('int64', 'int64_t', 'int64_t', 'iNaT', True), - ] - date_like_list = [ - ('int64', 'int64_t', 'int64_t', 'iNaT', True), - ] - object_list = [('object', 'object', 'object', 'np.nan', False)] - function_list = [] - if use_floats: - function_list.extend(floats_list) - if use_ints: - function_list.extend(ints_list) - if use_objects: - function_list.extend(object_list) - if use_datelikes: - function_list.extend(date_like_list) - - output = StringIO() - for name, c_type, dest_type, nan_val, nogil in function_list: - - if nogil: - nogil = "with nogil:" - tab = ' ' - else: - nogil = '' - tab = '' - - func = template % {'name': name, - 'c_type': c_type, - 'dest_type2': dest_type, - 'nan_val': nan_val, - 'nogil' : nogil, - 'tab' : tab } - output.write(func) - output.write("\n") - return output.getvalue() - -def generate_take_template(template, exclude=None): - # name, dest, ctypein, ctypeout, preval, postval, cancopy, nogil - function_list = [ - ('bool', 'bool', 'uint8_t', 'uint8_t', '', '', True, True), - ('bool', 'object', 'uint8_t', 'object', - 'True if ', ' > 0 else False', False, False), - ('int8', 'int8', 'int8_t', 'int8_t', '', '', True, False), - ('int8', 'int32', 'int8_t', 'int32_t', '', '', False, True), - ('int8', 'int64', 'int8_t', 'int64_t', '', '', False, True), - ('int8', 'float64', 'int8_t', 'float64_t', '', '', False, True), - ('int16', 'int16', 'int16_t', 'int16_t', '', '', True, True), - ('int16', 'int32', 'int16_t', 'int32_t', '', '', False, True), - ('int16', 'int64', 'int16_t', 'int64_t', '', '', False, True), - ('int16', 'float64', 'int16_t', 'float64_t', '', '', False, True), - ('int32', 'int32', 'int32_t', 'int32_t', '', '', True, True), - ('int32', 'int64', 'int32_t', 'int64_t', '', '', False, True), - ('int32', 'float64', 'int32_t', 'float64_t', '', '', False, True), - ('int64', 'int64', 'int64_t', 'int64_t', '', '', True, True), - ('int64', 'float64', 'int64_t', 'float64_t', '', '', False, True), - ('float32', 'float32', 'float32_t', 'float32_t', '', '', True, 
True), - ('float32', 'float64', 'float32_t', 'float64_t', '', '', False, True), - ('float64', 'float64', 'float64_t', 'float64_t', '', '', True, True), - ('object', 'object', 'object', 'object', '', '', False, False), - ] - - output = StringIO() - for (name, dest, c_type_in, c_type_out, - preval, postval, can_copy, nogil) in function_list: - - if exclude is not None and name in exclude: - continue - - if nogil: - nogil = "with nogil:" - tab = ' ' - else: - nogil = '' - tab = '' - - func = template % {'name': name, 'dest': dest, - 'c_type_in': c_type_in, 'c_type_out': c_type_out, - 'preval': preval, 'postval': postval, - 'can_copy': 'True' if can_copy else 'False', - 'nogil' : nogil, - 'tab' : tab } - output.write(func) - output.write("\n") - return output.getvalue() - -def generate_from_template(template, exclude=None): - # name, ctype, capable of holding NA - function_list = [ - ('float64', 'float64_t', 'np.float64', True, True), - ('float32', 'float32_t', 'np.float32', True, True), - ('object', 'object', 'object', True, False), - ('int32', 'int32_t', 'np.int32', False, True), - ('int64', 'int64_t', 'np.int64', False, True), - ('bool', 'uint8_t', 'np.bool', False, True) - ] - - output = StringIO() - for name, c_type, dtype, can_hold_na, nogil in function_list: - if exclude is not None and name in exclude: - continue - - func = template % {'name': name, 'c_type': c_type, - 'dtype': dtype, - 'raise_on_na': 'False' if can_hold_na else 'True', - 'nogil' : 'with nogil:' if nogil else '', - 'tab' : ' ' if nogil else '' } - output.write(func) - output.write("\n") - return output.getvalue() - -put_2d = [diff_2d_template] - -groupbys = [group_add_template, - group_prod_template, - group_var_template, - group_mean_template, - group_ohlc_template] - -groupby_selection = [group_last_template, - group_nth_template] - -groupby_min_max = [group_min_template, - group_max_template] - -templates_1d = [map_indices_template, - pad_template, - backfill_template, - pad_1d_template, - backfill_1d_template, - pad_2d_template, - backfill_2d_template, - is_monotonic_template, - groupby_template, - arrmap_template] - -nobool_1d_templates = [left_join_unique_template, - left_join_template, - outer_join_template2, - inner_join_template] - -take_templates = [take_1d_template, - take_2d_axis0_template, - take_2d_axis1_template, - take_2d_multi_template] - - -def generate_take_cython_file(): - # Put `generated.pyx` in the same directory as this file - directory = os.path.dirname(os.path.realpath(__file__)) - filename = 'generated.pyx' - path = os.path.join(directory, filename) - - with open(path, 'w') as f: - print(warning_to_new_contributors, file=f) - print(header, file=f) - - print(generate_ensure_dtypes(), file=f) - - for template in templates_1d: - print(generate_from_template(template), file=f) - - for template in take_templates: - print(generate_take_template(template), file=f) - - for template in put_2d: - print(generate_put_template(template), file=f) - - for template in groupbys: - print(generate_put_template(template, use_ints=False), file=f) - - for template in groupby_selection: - print(generate_put_selection_template(template, use_ints=True), - file=f) - - for template in groupby_min_max: - print(generate_put_min_max_template(template, use_ints=True), - file=f) - - print(grouping_no_template, file=f) - - for template in nobool_1d_templates: - print(generate_from_template(template, exclude=['bool']), file=f) - - -if __name__ == '__main__': - generate_take_cython_file() diff --git a/pandas/src/generated.pyx 
b/pandas/src/generated.pyx deleted file mode 100644 index c6dcd609a2c6e..0000000000000 --- a/pandas/src/generated.pyx +++ /dev/null @@ -1,10522 +0,0 @@ - -# DO NOT EDIT THIS FILE: This file was autogenerated from generate_code.py, so -# please edit that file and then run `python2 generate_code.py` to re-generate -# this file. - - -cimport numpy as np -cimport cython - -from libc.string cimport memmove - -from numpy cimport * - -from cpython cimport (PyDict_New, PyDict_GetItem, PyDict_SetItem, - PyDict_Contains, PyDict_Keys, - Py_INCREF, PyTuple_SET_ITEM, - PyTuple_SetItem, - PyTuple_New) -from cpython cimport PyFloat_Check -cimport cpython - -cdef extern from "numpy/npy_math.h": - double NAN "NPY_NAN" - -import numpy as np -isnan = np.isnan - -from datetime import datetime as pydatetime - -# this is our datetime.pxd -from datetime cimport * - -from khash cimport * - -ctypedef unsigned char UChar - -cimport util -from util cimport is_array, _checknull, _checknan, get_nat -cimport lib -from lib cimport is_null_datetimelike - -cdef int64_t iNaT = get_nat() - -# import datetime C API -PyDateTime_IMPORT - -# initialize numpy -import_array() -import_ufunc() - -cdef int PLATFORM_INT = ( np.arange(0, dtype=np.int_)).descr.type_num - -cpdef ensure_platform_int(object arr): - if util.is_array(arr): - if ( arr).descr.type_num == PLATFORM_INT: - return arr - else: - return arr.astype(np.int_) - else: - return np.array(arr, dtype=np.int_) - -cpdef ensure_object(object arr): - if util.is_array(arr): - if ( arr).descr.type_num == NPY_OBJECT: - return arr - else: - return arr.astype(np.object_) - elif hasattr(arr,'asobject'): - return arr.asobject - else: - return np.array(arr, dtype=np.object_) - - -cpdef ensure_float64(object arr): - if util.is_array(arr): - if ( arr).descr.type_num == NPY_FLOAT64: - return arr - else: - return arr.astype(np.float64) - else: - return np.array(arr, dtype=np.float64) - -cpdef ensure_float32(object arr): - if util.is_array(arr): - if ( arr).descr.type_num == NPY_FLOAT32: - return arr - else: - return arr.astype(np.float32) - else: - return np.array(arr, dtype=np.float32) - -cpdef ensure_int8(object arr): - if util.is_array(arr): - if ( arr).descr.type_num == NPY_INT8: - return arr - else: - return arr.astype(np.int8) - else: - return np.array(arr, dtype=np.int8) - -cpdef ensure_int16(object arr): - if util.is_array(arr): - if ( arr).descr.type_num == NPY_INT16: - return arr - else: - return arr.astype(np.int16) - else: - return np.array(arr, dtype=np.int16) - -cpdef ensure_int32(object arr): - if util.is_array(arr): - if ( arr).descr.type_num == NPY_INT32: - return arr - else: - return arr.astype(np.int32) - else: - return np.array(arr, dtype=np.int32) - -cpdef ensure_int64(object arr): - if util.is_array(arr): - if ( arr).descr.type_num == NPY_INT64: - return arr - else: - return arr.astype(np.int64) - else: - return np.array(arr, dtype=np.int64) - -@cython.wraparound(False) -@cython.boundscheck(False) -cpdef map_indices_float64(ndarray[float64_t] index): - """ - Produce a dict mapping the values of the input array to their respective - locations. - - Example: - array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} - - Better to do this with Cython because of the enormous speed boost. 
- """ - cdef Py_ssize_t i, length - cdef dict result = {} - - length = len(index) - - for i in range(length): - result[index[i]] = i - - return result - -@cython.wraparound(False) -@cython.boundscheck(False) -cpdef map_indices_float32(ndarray[float32_t] index): - """ - Produce a dict mapping the values of the input array to their respective - locations. - - Example: - array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} - - Better to do this with Cython because of the enormous speed boost. - """ - cdef Py_ssize_t i, length - cdef dict result = {} - - length = len(index) - - for i in range(length): - result[index[i]] = i - - return result - -@cython.wraparound(False) -@cython.boundscheck(False) -cpdef map_indices_object(ndarray[object] index): - """ - Produce a dict mapping the values of the input array to their respective - locations. - - Example: - array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} - - Better to do this with Cython because of the enormous speed boost. - """ - cdef Py_ssize_t i, length - cdef dict result = {} - - length = len(index) - - for i in range(length): - result[index[i]] = i - - return result - -@cython.wraparound(False) -@cython.boundscheck(False) -cpdef map_indices_int32(ndarray[int32_t] index): - """ - Produce a dict mapping the values of the input array to their respective - locations. - - Example: - array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} - - Better to do this with Cython because of the enormous speed boost. - """ - cdef Py_ssize_t i, length - cdef dict result = {} - - length = len(index) - - for i in range(length): - result[index[i]] = i - - return result - -@cython.wraparound(False) -@cython.boundscheck(False) -cpdef map_indices_int64(ndarray[int64_t] index): - """ - Produce a dict mapping the values of the input array to their respective - locations. - - Example: - array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} - - Better to do this with Cython because of the enormous speed boost. - """ - cdef Py_ssize_t i, length - cdef dict result = {} - - length = len(index) - - for i in range(length): - result[index[i]] = i - - return result - -@cython.wraparound(False) -@cython.boundscheck(False) -cpdef map_indices_bool(ndarray[uint8_t] index): - """ - Produce a dict mapping the values of the input array to their respective - locations. - - Example: - array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} - - Better to do this with Cython because of the enormous speed boost. 
- """ - cdef Py_ssize_t i, length - cdef dict result = {} - - length = len(index) - - for i in range(length): - result[index[i]] = i - - return result - - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_float64(ndarray[float64_t] old, ndarray[float64_t] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef float64_t cur, next - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: - return indexer - - i = j = 0 - - cur = old[0] - - while j <= nright - 1 and new[j] < cur: - j += 1 - - while True: - if j == nright: - break - - if i == nleft - 1: - while j < nright: - if new[j] == cur: - indexer[j] = i - elif new[j] > cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - break - - next = old[i + 1] - - while j < nright and cur <= new[j] < next: - if new[j] == cur: - indexer[j] = i - elif fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - - fill_count = 0 - i += 1 - cur = next - - return indexer - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_float32(ndarray[float32_t] old, ndarray[float32_t] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef float32_t cur, next - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: - return indexer - - i = j = 0 - - cur = old[0] - - while j <= nright - 1 and new[j] < cur: - j += 1 - - while True: - if j == nright: - break - - if i == nleft - 1: - while j < nright: - if new[j] == cur: - indexer[j] = i - elif new[j] > cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - break - - next = old[i + 1] - - while j < nright and cur <= new[j] < next: - if new[j] == cur: - indexer[j] = i - elif fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - - fill_count = 0 - i += 1 - cur = next - - return indexer - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_object(ndarray[object] old, ndarray[object] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef object cur, next - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: - return indexer - - i = j = 0 - - cur = old[0] - - while j <= nright - 1 and new[j] < cur: - j += 1 - - while True: - if j == nright: - break - - if i == nleft - 1: - while j < nright: - if new[j] == cur: - indexer[j] = i - elif new[j] > cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - break - - next = old[i + 1] - - while j < nright and cur <= new[j] < next: - if new[j] == cur: - indexer[j] = i - elif fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - - fill_count = 0 - i += 1 - cur = next - - return indexer - -@cython.boundscheck(False) -@cython.wraparound(False) -def 
pad_int32(ndarray[int32_t] old, ndarray[int32_t] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef int32_t cur, next - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: - return indexer - - i = j = 0 - - cur = old[0] - - while j <= nright - 1 and new[j] < cur: - j += 1 - - while True: - if j == nright: - break - - if i == nleft - 1: - while j < nright: - if new[j] == cur: - indexer[j] = i - elif new[j] > cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - break - - next = old[i + 1] - - while j < nright and cur <= new[j] < next: - if new[j] == cur: - indexer[j] = i - elif fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - - fill_count = 0 - i += 1 - cur = next - - return indexer - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_int64(ndarray[int64_t] old, ndarray[int64_t] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef int64_t cur, next - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: - return indexer - - i = j = 0 - - cur = old[0] - - while j <= nright - 1 and new[j] < cur: - j += 1 - - while True: - if j == nright: - break - - if i == nleft - 1: - while j < nright: - if new[j] == cur: - indexer[j] = i - elif new[j] > cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - break - - next = old[i + 1] - - while j < nright and cur <= new[j] < next: - if new[j] == cur: - indexer[j] = i - elif fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - - fill_count = 0 - i += 1 - cur = next - - return indexer - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_bool(ndarray[uint8_t] old, ndarray[uint8_t] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef uint8_t cur, next - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: - return indexer - - i = j = 0 - - cur = old[0] - - while j <= nright - 1 and new[j] < cur: - j += 1 - - while True: - if j == nright: - break - - if i == nleft - 1: - while j < nright: - if new[j] == cur: - indexer[j] = i - elif new[j] > cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - break - - next = old[i + 1] - - while j < nright and cur <= new[j] < next: - if new[j] == cur: - indexer[j] = i - elif fill_count < lim: - indexer[j] = i - fill_count += 1 - j += 1 - - fill_count = 0 - i += 1 - cur = next - - return indexer - - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_float64(ndarray[float64_t] old, ndarray[float64_t] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef float64_t cur, prev - cdef int lim, fill_count = 0 - - nleft = 
len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: - return indexer - - i = nleft - 1 - j = nright - 1 - - cur = old[nleft - 1] - - while j >= 0 and new[j] > cur: - j -= 1 - - while True: - if j < 0: - break - - if i == 0: - while j >= 0: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - break - - prev = old[i - 1] - - while j >= 0 and prev < new[j] <= cur: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - - fill_count = 0 - i -= 1 - cur = prev - - return indexer - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_float32(ndarray[float32_t] old, ndarray[float32_t] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef float32_t cur, prev - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: - return indexer - - i = nleft - 1 - j = nright - 1 - - cur = old[nleft - 1] - - while j >= 0 and new[j] > cur: - j -= 1 - - while True: - if j < 0: - break - - if i == 0: - while j >= 0: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - break - - prev = old[i - 1] - - while j >= 0 and prev < new[j] <= cur: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - - fill_count = 0 - i -= 1 - cur = prev - - return indexer - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_object(ndarray[object] old, ndarray[object] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef object cur, prev - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: - return indexer - - i = nleft - 1 - j = nright - 1 - - cur = old[nleft - 1] - - while j >= 0 and new[j] > cur: - j -= 1 - - while True: - if j < 0: - break - - if i == 0: - while j >= 0: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - break - - prev = old[i - 1] - - while j >= 0 and prev < new[j] <= cur: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - - fill_count = 0 - i -= 1 - cur = prev - - return indexer - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_int32(ndarray[int32_t] old, ndarray[int32_t] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef int32_t cur, prev - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise 
ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: - return indexer - - i = nleft - 1 - j = nright - 1 - - cur = old[nleft - 1] - - while j >= 0 and new[j] > cur: - j -= 1 - - while True: - if j < 0: - break - - if i == 0: - while j >= 0: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - break - - prev = old[i - 1] - - while j >= 0 and prev < new[j] <= cur: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - - fill_count = 0 - i -= 1 - cur = prev - - return indexer - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_int64(ndarray[int64_t] old, ndarray[int64_t] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef int64_t cur, prev - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: - return indexer - - i = nleft - 1 - j = nright - 1 - - cur = old[nleft - 1] - - while j >= 0 and new[j] > cur: - j -= 1 - - while True: - if j < 0: - break - - if i == 0: - while j >= 0: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - break - - prev = old[i - 1] - - while j >= 0 and prev < new[j] <= cur: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - - fill_count = 0 - i -= 1 - cur = prev - - return indexer - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_bool(ndarray[uint8_t] old, ndarray[uint8_t] new, - limit=None): - cdef Py_ssize_t i, j, nleft, nright - cdef ndarray[int64_t, ndim=1] indexer - cdef uint8_t cur, prev - cdef int lim, fill_count = 0 - - nleft = len(old) - nright = len(new) - indexer = np.empty(nright, dtype=np.int64) - indexer.fill(-1) - - if limit is None: - lim = nright - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: - return indexer - - i = nleft - 1 - j = nright - 1 - - cur = old[nleft - 1] - - while j >= 0 and new[j] > cur: - j -= 1 - - while True: - if j < 0: - break - - if i == 0: - while j >= 0: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - break - - prev = old[i - 1] - - while j >= 0 and prev < new[j] <= cur: - if new[j] == cur: - indexer[j] = i - elif new[j] < cur and fill_count < lim: - indexer[j] = i - fill_count += 1 - j -= 1 - - fill_count = 0 - i -= 1 - cur = prev - - return indexer - - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_inplace_float64(ndarray[float64_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef float64_t val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[0] - for i in range(N): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - -@cython.boundscheck(False) 
-@cython.wraparound(False) -def pad_inplace_float32(ndarray[float32_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef float32_t val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[0] - for i in range(N): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_inplace_object(ndarray[object] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef object val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[0] - for i in range(N): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_inplace_int32(ndarray[int32_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef int32_t val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[0] - for i in range(N): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_inplace_int64(ndarray[int64_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef int64_t val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[0] - for i in range(N): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_inplace_bool(ndarray[uint8_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef uint8_t val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[0] - for i in range(N): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_inplace_float64(ndarray[float64_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef float64_t val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[N - 1] - for i in range(N - 1, -1 , -1): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_inplace_float32(ndarray[float32_t] 
values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef float32_t val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[N - 1] - for i in range(N - 1, -1 , -1): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_inplace_object(ndarray[object] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef object val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[N - 1] - for i in range(N - 1, -1 , -1): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_inplace_int32(ndarray[int32_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef int32_t val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[N - 1] - for i in range(N - 1, -1 , -1): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_inplace_int64(ndarray[int64_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef int64_t val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[N - 1] - for i in range(N - 1, -1 , -1): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_inplace_bool(ndarray[uint8_t] values, - ndarray[uint8_t, cast=True] mask, - limit=None): - cdef Py_ssize_t i, N - cdef uint8_t val - cdef int lim, fill_count = 0 - - N = len(values) - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - val = values[N - 1] - for i in range(N - 1, -1 , -1): - if mask[i]: - if fill_count >= lim: - continue - fill_count += 1 - values[i] = val - else: - fill_count = 0 - val = values[i] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_2d_inplace_float64(ndarray[float64_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef float64_t val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, 0] - for i in range(N): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - -@cython.boundscheck(False) 
-@cython.wraparound(False) -def pad_2d_inplace_float32(ndarray[float32_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef float32_t val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, 0] - for i in range(N): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_2d_inplace_object(ndarray[object, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef object val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, 0] - for i in range(N): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_2d_inplace_int32(ndarray[int32_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef int32_t val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, 0] - for i in range(N): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_2d_inplace_int64(ndarray[int64_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef int64_t val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, 0] - for i in range(N): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - -@cython.boundscheck(False) -@cython.wraparound(False) -def pad_2d_inplace_bool(ndarray[uint8_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef uint8_t val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, 0] - for i in range(N): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_2d_inplace_float64(ndarray[float64_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef float64_t val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - 
if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, N - 1] - for i in range(N - 1, -1 , -1): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_2d_inplace_float32(ndarray[float32_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef float32_t val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, N - 1] - for i in range(N - 1, -1 , -1): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_2d_inplace_object(ndarray[object, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef object val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, N - 1] - for i in range(N - 1, -1 , -1): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_2d_inplace_int32(ndarray[int32_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef int32_t val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, N - 1] - for i in range(N - 1, -1 , -1): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_2d_inplace_int64(ndarray[int64_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef int64_t val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, N - 1] - for i in range(N - 1, -1 , -1): - if mask[j, i]: - if fill_count >= lim: - continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - -@cython.boundscheck(False) -@cython.wraparound(False) -def backfill_2d_inplace_bool(ndarray[uint8_t, ndim=2] values, - ndarray[uint8_t, ndim=2] mask, - limit=None): - cdef Py_ssize_t i, j, N, K - cdef uint8_t val - cdef int lim, fill_count = 0 - - K, N = ( values).shape - - # GH 2778 - if N == 0: - return - - if limit is None: - lim = N - else: - if limit < 0: - raise ValueError('Limit must be non-negative') - lim = limit - - for j in range(K): - fill_count = 0 - val = values[j, N - 1] - for i in range(N - 1, -1 , -1): - if mask[j, i]: - if fill_count >= lim: - 
continue - fill_count += 1 - values[j, i] = val - else: - fill_count = 0 - val = values[j, i] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def is_monotonic_float64(ndarray[float64_t] arr, bint timelike): - """ - Returns - ------- - is_monotonic_inc, is_monotonic_dec - """ - cdef: - Py_ssize_t i, n - float64_t prev, cur - bint is_monotonic_inc = 1 - bint is_monotonic_dec = 1 - - n = len(arr) - - if n == 1: - if arr[0] != arr[0] or (timelike and arr[0] == iNaT): - # single value is NaN - return False, False - else: - return True, True - elif n < 2: - return True, True - - if timelike and arr[0] == iNaT: - return False, False - - with nogil: - prev = arr[0] - for i in range(1, n): - cur = arr[i] - if timelike and cur == iNaT: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if cur < prev: - is_monotonic_inc = 0 - elif cur > prev: - is_monotonic_dec = 0 - elif cur == prev: - pass # is_unique = 0 - else: - # cur or prev is NaN - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if not is_monotonic_inc and not is_monotonic_dec: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - prev = cur - return is_monotonic_inc, is_monotonic_dec - -@cython.boundscheck(False) -@cython.wraparound(False) -def is_monotonic_float32(ndarray[float32_t] arr, bint timelike): - """ - Returns - ------- - is_monotonic_inc, is_monotonic_dec - """ - cdef: - Py_ssize_t i, n - float32_t prev, cur - bint is_monotonic_inc = 1 - bint is_monotonic_dec = 1 - - n = len(arr) - - if n == 1: - if arr[0] != arr[0] or (timelike and arr[0] == iNaT): - # single value is NaN - return False, False - else: - return True, True - elif n < 2: - return True, True - - if timelike and arr[0] == iNaT: - return False, False - - with nogil: - prev = arr[0] - for i in range(1, n): - cur = arr[i] - if timelike and cur == iNaT: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if cur < prev: - is_monotonic_inc = 0 - elif cur > prev: - is_monotonic_dec = 0 - elif cur == prev: - pass # is_unique = 0 - else: - # cur or prev is NaN - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if not is_monotonic_inc and not is_monotonic_dec: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - prev = cur - return is_monotonic_inc, is_monotonic_dec - -@cython.boundscheck(False) -@cython.wraparound(False) -def is_monotonic_object(ndarray[object] arr, bint timelike): - """ - Returns - ------- - is_monotonic_inc, is_monotonic_dec - """ - cdef: - Py_ssize_t i, n - object prev, cur - bint is_monotonic_inc = 1 - bint is_monotonic_dec = 1 - - n = len(arr) - - if n == 1: - if arr[0] != arr[0] or (timelike and arr[0] == iNaT): - # single value is NaN - return False, False - else: - return True, True - elif n < 2: - return True, True - - if timelike and arr[0] == iNaT: - return False, False - - - prev = arr[0] - for i in range(1, n): - cur = arr[i] - if timelike and cur == iNaT: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if cur < prev: - is_monotonic_inc = 0 - elif cur > prev: - is_monotonic_dec = 0 - elif cur == prev: - pass # is_unique = 0 - else: - # cur or prev is NaN - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if not is_monotonic_inc and not is_monotonic_dec: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - prev = cur - return is_monotonic_inc, is_monotonic_dec - -@cython.boundscheck(False) -@cython.wraparound(False) -def is_monotonic_int32(ndarray[int32_t] arr, bint timelike): - """ - Returns - ------- - is_monotonic_inc, is_monotonic_dec - """ - cdef: - Py_ssize_t i, n - int32_t prev, cur - bint 
is_monotonic_inc = 1 - bint is_monotonic_dec = 1 - - n = len(arr) - - if n == 1: - if arr[0] != arr[0] or (timelike and arr[0] == iNaT): - # single value is NaN - return False, False - else: - return True, True - elif n < 2: - return True, True - - if timelike and arr[0] == iNaT: - return False, False - - with nogil: - prev = arr[0] - for i in range(1, n): - cur = arr[i] - if timelike and cur == iNaT: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if cur < prev: - is_monotonic_inc = 0 - elif cur > prev: - is_monotonic_dec = 0 - elif cur == prev: - pass # is_unique = 0 - else: - # cur or prev is NaN - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if not is_monotonic_inc and not is_monotonic_dec: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - prev = cur - return is_monotonic_inc, is_monotonic_dec - -@cython.boundscheck(False) -@cython.wraparound(False) -def is_monotonic_int64(ndarray[int64_t] arr, bint timelike): - """ - Returns - ------- - is_monotonic_inc, is_monotonic_dec - """ - cdef: - Py_ssize_t i, n - int64_t prev, cur - bint is_monotonic_inc = 1 - bint is_monotonic_dec = 1 - - n = len(arr) - - if n == 1: - if arr[0] != arr[0] or (timelike and arr[0] == iNaT): - # single value is NaN - return False, False - else: - return True, True - elif n < 2: - return True, True - - if timelike and arr[0] == iNaT: - return False, False - - with nogil: - prev = arr[0] - for i in range(1, n): - cur = arr[i] - if timelike and cur == iNaT: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if cur < prev: - is_monotonic_inc = 0 - elif cur > prev: - is_monotonic_dec = 0 - elif cur == prev: - pass # is_unique = 0 - else: - # cur or prev is NaN - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if not is_monotonic_inc and not is_monotonic_dec: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - prev = cur - return is_monotonic_inc, is_monotonic_dec - -@cython.boundscheck(False) -@cython.wraparound(False) -def is_monotonic_bool(ndarray[uint8_t] arr, bint timelike): - """ - Returns - ------- - is_monotonic_inc, is_monotonic_dec - """ - cdef: - Py_ssize_t i, n - uint8_t prev, cur - bint is_monotonic_inc = 1 - bint is_monotonic_dec = 1 - - n = len(arr) - - if n == 1: - if arr[0] != arr[0] or (timelike and arr[0] == iNaT): - # single value is NaN - return False, False - else: - return True, True - elif n < 2: - return True, True - - if timelike and arr[0] == iNaT: - return False, False - - with nogil: - prev = arr[0] - for i in range(1, n): - cur = arr[i] - if timelike and cur == iNaT: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if cur < prev: - is_monotonic_inc = 0 - elif cur > prev: - is_monotonic_dec = 0 - elif cur == prev: - pass # is_unique = 0 - else: - # cur or prev is NaN - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - if not is_monotonic_inc and not is_monotonic_dec: - is_monotonic_inc = 0 - is_monotonic_dec = 0 - break - prev = cur - return is_monotonic_inc, is_monotonic_dec - - -@cython.wraparound(False) -@cython.boundscheck(False) -def groupby_float64(ndarray[float64_t] index, ndarray labels): - cdef dict result = {} - cdef Py_ssize_t i, length - cdef list members - cdef object idx, key - - length = len(index) - - if not length == len(labels): - raise AssertionError("len(index) != len(labels)") - - for i in range(length): - key = util.get_value_1d(labels, i) - - if is_null_datetimelike(key): - continue - - idx = index[i] - if key in result: - members = result[key] - members.append(idx) - else: - result[key] = [idx] - - return result - 
-@cython.wraparound(False) -@cython.boundscheck(False) -def groupby_float32(ndarray[float32_t] index, ndarray labels): - cdef dict result = {} - cdef Py_ssize_t i, length - cdef list members - cdef object idx, key - - length = len(index) - - if not length == len(labels): - raise AssertionError("len(index) != len(labels)") - - for i in range(length): - key = util.get_value_1d(labels, i) - - if is_null_datetimelike(key): - continue - - idx = index[i] - if key in result: - members = result[key] - members.append(idx) - else: - result[key] = [idx] - - return result - -@cython.wraparound(False) -@cython.boundscheck(False) -def groupby_object(ndarray[object] index, ndarray labels): - cdef dict result = {} - cdef Py_ssize_t i, length - cdef list members - cdef object idx, key - - length = len(index) - - if not length == len(labels): - raise AssertionError("len(index) != len(labels)") - - for i in range(length): - key = util.get_value_1d(labels, i) - - if is_null_datetimelike(key): - continue - - idx = index[i] - if key in result: - members = result[key] - members.append(idx) - else: - result[key] = [idx] - - return result - -@cython.wraparound(False) -@cython.boundscheck(False) -def groupby_int32(ndarray[int32_t] index, ndarray labels): - cdef dict result = {} - cdef Py_ssize_t i, length - cdef list members - cdef object idx, key - - length = len(index) - - if not length == len(labels): - raise AssertionError("len(index) != len(labels)") - - for i in range(length): - key = util.get_value_1d(labels, i) - - if is_null_datetimelike(key): - continue - - idx = index[i] - if key in result: - members = result[key] - members.append(idx) - else: - result[key] = [idx] - - return result - -@cython.wraparound(False) -@cython.boundscheck(False) -def groupby_int64(ndarray[int64_t] index, ndarray labels): - cdef dict result = {} - cdef Py_ssize_t i, length - cdef list members - cdef object idx, key - - length = len(index) - - if not length == len(labels): - raise AssertionError("len(index) != len(labels)") - - for i in range(length): - key = util.get_value_1d(labels, i) - - if is_null_datetimelike(key): - continue - - idx = index[i] - if key in result: - members = result[key] - members.append(idx) - else: - result[key] = [idx] - - return result - -@cython.wraparound(False) -@cython.boundscheck(False) -def groupby_bool(ndarray[uint8_t] index, ndarray labels): - cdef dict result = {} - cdef Py_ssize_t i, length - cdef list members - cdef object idx, key - - length = len(index) - - if not length == len(labels): - raise AssertionError("len(index) != len(labels)") - - for i in range(length): - key = util.get_value_1d(labels, i) - - if is_null_datetimelike(key): - continue - - idx = index[i] - if key in result: - members = result[key] - members.append(idx) - else: - result[key] = [idx] - - return result - - -@cython.wraparound(False) -@cython.boundscheck(False) -def arrmap_float64(ndarray[float64_t] index, object func): - cdef Py_ssize_t length = index.shape[0] - cdef Py_ssize_t i = 0 - - cdef ndarray[object] result = np.empty(length, dtype=np.object_) - - from pandas.lib import maybe_convert_objects - - for i in range(length): - result[i] = func(index[i]) - - return maybe_convert_objects(result) - -@cython.wraparound(False) -@cython.boundscheck(False) -def arrmap_float32(ndarray[float32_t] index, object func): - cdef Py_ssize_t length = index.shape[0] - cdef Py_ssize_t i = 0 - - cdef ndarray[object] result = np.empty(length, dtype=np.object_) - - from pandas.lib import maybe_convert_objects - - for i in 
range(length): - result[i] = func(index[i]) - - return maybe_convert_objects(result) - -@cython.wraparound(False) -@cython.boundscheck(False) -def arrmap_object(ndarray[object] index, object func): - cdef Py_ssize_t length = index.shape[0] - cdef Py_ssize_t i = 0 - - cdef ndarray[object] result = np.empty(length, dtype=np.object_) - - from pandas.lib import maybe_convert_objects - - for i in range(length): - result[i] = func(index[i]) - - return maybe_convert_objects(result) - -@cython.wraparound(False) -@cython.boundscheck(False) -def arrmap_int32(ndarray[int32_t] index, object func): - cdef Py_ssize_t length = index.shape[0] - cdef Py_ssize_t i = 0 - - cdef ndarray[object] result = np.empty(length, dtype=np.object_) - - from pandas.lib import maybe_convert_objects - - for i in range(length): - result[i] = func(index[i]) - - return maybe_convert_objects(result) - -@cython.wraparound(False) -@cython.boundscheck(False) -def arrmap_int64(ndarray[int64_t] index, object func): - cdef Py_ssize_t length = index.shape[0] - cdef Py_ssize_t i = 0 - - cdef ndarray[object] result = np.empty(length, dtype=np.object_) - - from pandas.lib import maybe_convert_objects - - for i in range(length): - result[i] = func(index[i]) - - return maybe_convert_objects(result) - -@cython.wraparound(False) -@cython.boundscheck(False) -def arrmap_bool(ndarray[uint8_t] index, object func): - cdef Py_ssize_t length = index.shape[0] - cdef Py_ssize_t i = 0 - - cdef ndarray[object] result = np.empty(length, dtype=np.object_) - - from pandas.lib import maybe_convert_objects - - for i in range(length): - result[i] = func(index[i]) - - return maybe_convert_objects(result) - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_bool_bool_memview(uint8_t[:] values, - int64_t[:] indexer, - uint8_t[:] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, n, idx - uint8_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_bool_bool(ndarray[uint8_t, ndim=1] values, - int64_t[:] indexer, - uint8_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_bool_bool_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, n, idx - uint8_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_bool_object_memview(uint8_t[:] values, - int64_t[:] indexer, - object[:] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, n, idx - object fv - - n = indexer.shape[0] - - fv = fill_value - - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = True if values[idx] > 0 else False - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_bool_object(ndarray[uint8_t, ndim=1] values, - int64_t[:] indexer, - object[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_bool_object_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, n, idx - object fv - - n = indexer.shape[0] - - fv = fill_value - - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = True if values[idx] > 0 else False - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int8_int8_memview(int8_t[:] values, - int64_t[:] indexer, - int8_t[:] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, n, idx - int8_t fv - - n = indexer.shape[0] - - fv = fill_value - - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int8_int8(ndarray[int8_t, ndim=1] values, - int64_t[:] indexer, - int8_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int8_int8_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, n, idx - int8_t fv - - n = indexer.shape[0] - - fv = fill_value - - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int8_int32_memview(int8_t[:] values, - int64_t[:] indexer, - int32_t[:] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, n, idx - int32_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int8_int32(ndarray[int8_t, ndim=1] values, - int64_t[:] indexer, - int32_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int8_int32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, n, idx - int32_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int8_int64_memview(int8_t[:] values, - int64_t[:] indexer, - int64_t[:] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, n, idx - int64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int8_int64(ndarray[int8_t, ndim=1] values, - int64_t[:] indexer, - int64_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int8_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, n, idx - int64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int8_float64_memview(int8_t[:] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int8_float64(ndarray[int8_t, ndim=1] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int8_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int16_int16_memview(int16_t[:] values, - int64_t[:] indexer, - int16_t[:] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, n, idx - int16_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int16_int16(ndarray[int16_t, ndim=1] values, - int64_t[:] indexer, - int16_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int16_int16_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, n, idx - int16_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int16_int32_memview(int16_t[:] values, - int64_t[:] indexer, - int32_t[:] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, n, idx - int32_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int16_int32(ndarray[int16_t, ndim=1] values, - int64_t[:] indexer, - int32_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int16_int32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, n, idx - int32_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int16_int64_memview(int16_t[:] values, - int64_t[:] indexer, - int64_t[:] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, n, idx - int64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int16_int64(ndarray[int16_t, ndim=1] values, - int64_t[:] indexer, - int64_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int16_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, n, idx - int64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int16_float64_memview(int16_t[:] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int16_float64(ndarray[int16_t, ndim=1] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int16_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int32_int32_memview(int32_t[:] values, - int64_t[:] indexer, - int32_t[:] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, n, idx - int32_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int32_int32(ndarray[int32_t, ndim=1] values, - int64_t[:] indexer, - int32_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int32_int32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, n, idx - int32_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int32_int64_memview(int32_t[:] values, - int64_t[:] indexer, - int64_t[:] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, n, idx - int64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int32_int64(ndarray[int32_t, ndim=1] values, - int64_t[:] indexer, - int64_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int32_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, n, idx - int64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int32_float64_memview(int32_t[:] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int32_float64(ndarray[int32_t, ndim=1] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int32_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int64_int64_memview(int64_t[:] values, - int64_t[:] indexer, - int64_t[:] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, n, idx - int64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int64_int64(ndarray[int64_t, ndim=1] values, - int64_t[:] indexer, - int64_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int64_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, n, idx - int64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_int64_float64_memview(int64_t[:] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_int64_float64(ndarray[int64_t, ndim=1] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_int64_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_float32_float32_memview(float32_t[:] values, - int64_t[:] indexer, - float32_t[:] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, n, idx - float32_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_float32_float32(ndarray[float32_t, ndim=1] values, - int64_t[:] indexer, - float32_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_float32_float32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, n, idx - float32_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_float32_float64_memview(float32_t[:] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_float32_float64(ndarray[float32_t, ndim=1] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_float32_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_float64_float64_memview(float64_t[:] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_float64_float64(ndarray[float64_t, ndim=1] values, - int64_t[:] indexer, - float64_t[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_float64_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, n, idx - float64_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_1d_object_object_memview(object[:] values, - int64_t[:] indexer, - object[:] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, n, idx - object fv - - n = indexer.shape[0] - - fv = fill_value - - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_object_object(ndarray[object, ndim=1] values, - int64_t[:] indexer, - object[:] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_1d_object_object_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
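The deleted take_1d_* specializations above all share one kernel: walk the indexer, write fill_value wherever the index is -1, and otherwise copy values[idx]. Each public def wrapper first checks values.flags.writeable and delegates to the cdef inline *_memview version; for read-only input it repeats the same loop on the buffer-typed ndarray, because Cython typed memoryviews could not wrap read-only buffers at the time. The cross-dtype variants (int32 -> float64 and so on) exist because the default fill_value, np.nan, is only representable in a floating-point or object out array, and the object variants drop the `with nogil:` block since they touch Python references. Below is a minimal standalone sketch of that pattern for one dtype pair, using modern range loops in place of the legacy `for i from 0 <= i < n` syntax; the names and the simplified layout are hypothetical, not the generated ones.

cimport cython
import numpy as np
cimport numpy as cnp
from numpy cimport ndarray, int64_t, float64_t

cnp.import_array()

@cython.wraparound(False)
@cython.boundscheck(False)
cdef inline _take_1d_float64_memview(float64_t[:] values,
                                     int64_t[:] indexer,
                                     float64_t[:] out,
                                     float64_t fv):
    cdef Py_ssize_t i, idx, n = indexer.shape[0]
    with nogil:
        for i in range(n):
            idx = indexer[i]
            out[i] = fv if idx == -1 else values[idx]

@cython.wraparound(False)
@cython.boundscheck(False)
def take_1d_float64(ndarray[float64_t, ndim=1] values,
                    int64_t[:] indexer,
                    float64_t[:] out,
                    fill_value=np.nan):
    cdef Py_ssize_t i, idx, n = indexer.shape[0]
    cdef float64_t fv = fill_value

    if values.flags.writeable:
        # writable input: hand the work to the typed-memoryview kernel
        _take_1d_float64_memview(values, indexer, out, fv)
        return

    # read-only input: typed memoryviews cannot wrap it, so run the same
    # loop on the (slightly slower) buffer-typed ndarray directly
    for i in range(n):
        idx = indexer[i]
        out[i] = fv if idx == -1 else values[idx]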
- cdef: - Py_ssize_t i, n, idx - object fv - - n = indexer.shape[0] - - fv = fill_value - - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_bool_bool_memview(uint8_t[:, :] values, - int64_t[:] indexer, - uint8_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - uint8_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - uint8_t *v - uint8_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(uint8_t) and - sizeof(uint8_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(uint8_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_bool_bool(ndarray[uint8_t, ndim=2] values, - ndarray[int64_t] indexer, - uint8_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_bool_bool_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - uint8_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - uint8_t *v - uint8_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(uint8_t) and - sizeof(uint8_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(uint8_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_bool_object_memview(uint8_t[:, :] values, - int64_t[:] indexer, - object[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - object fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - object *v - object *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(object) and - sizeof(object) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(object) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = True if values[idx, j] > 0 else False - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_bool_object(ndarray[uint8_t, ndim=2] values, - ndarray[int64_t] indexer, - object[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_bool_object_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to 
- # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - object fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - object *v - object *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(object) and - sizeof(object) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(object) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = True if values[idx, j] > 0 else False - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int8_int8_memview(int8_t[:, :] values, - int64_t[:] indexer, - int8_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int8_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - int8_t *v - int8_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int8_t) and - sizeof(int8_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int8_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int8_int8(ndarray[int8_t, ndim=2] values, - ndarray[int64_t] indexer, - int8_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int8_int8_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
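The take_2d_axis0_* variants add one twist (GH3130): when both buffers have an inner stride equal to the itemsize (contiguous rows of the same dtype) and the copy is large enough to amortize the call, whole source rows are copied with memmove instead of the element loop. The IF True:/IF False: guards around that branch are Cython compile-time conditionals emitted by the code generator, so only the same-dtype specializations compile the memmove path; upcasting variants such as int8 to float64, and the bool-to-object variants (which also convert the uint8 storage back to Python booleans with `True if values[idx, j] > 0 else False`), keep only the plain loop. A minimal sketch of the same-dtype fast path follows, with hypothetical names and a simplified signature (fill value passed as a typed argument, no writeable dispatch):

cimport cython
from libc.string cimport memmove
from numpy cimport int64_t, float64_t

@cython.wraparound(False)
@cython.boundscheck(False)
def take_2d_axis0_float64(float64_t[:, :] values,
                          int64_t[:] indexer,
                          float64_t[:, :] out,
                          float64_t fv):
    cdef:
        Py_ssize_t i, j, idx
        Py_ssize_t n = indexer.shape[0]
        Py_ssize_t k = values.shape[1]
        float64_t *v
        float64_t *o

    # GH3130-style fast path: contiguous same-dtype rows and enough data
    # to make a bulk copy worthwhile -> memmove whole rows at once
    if (values.strides[1] == out.strides[1] and
            values.strides[1] == sizeof(float64_t) and
            sizeof(float64_t) * n >= 256):
        for i in range(n):
            idx = indexer[i]
            if idx == -1:
                for j in range(k):
                    out[i, j] = fv
            else:
                v = &values[idx, 0]
                o = &out[i, 0]
                memmove(o, v, sizeof(float64_t) * k)
        return

    # generic path: element-by-element gather
    for i in range(n):
        idx = indexer[i]
        if idx == -1:
            for j in range(k):
                out[i, j] = fv
        else:
            for j in range(k):
                out[i, j] = values[idx, j]

The 256-byte threshold mirrors the condition in the removed code; it is simply a heuristic so that very small copies do not pay the memmove call overhead.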
- cdef: - Py_ssize_t i, j, k, n, idx - int8_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - int8_t *v - int8_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int8_t) and - sizeof(int8_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int8_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int8_int32_memview(int8_t[:, :] values, - int64_t[:] indexer, - int32_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - int32_t *v - int32_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int32_t) and - sizeof(int32_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int32_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int8_int32(ndarray[int8_t, ndim=2] values, - ndarray[int64_t] indexer, - int32_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int8_int32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - int32_t *v - int32_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int32_t) and - sizeof(int32_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int32_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int8_int64_memview(int8_t[:, :] values, - int64_t[:] indexer, - int64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - int64_t *v - int64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int64_t) and - sizeof(int64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int8_int64(ndarray[int8_t, ndim=2] values, - ndarray[int64_t] indexer, - int64_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int8_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - int64_t *v - int64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int64_t) and - sizeof(int64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int8_float64_memview(int8_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int8_float64(ndarray[int8_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int8_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int16_int16_memview(int16_t[:, :] values, - int64_t[:] indexer, - int16_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int16_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - int16_t *v - int16_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int16_t) and - sizeof(int16_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int16_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int16_int16(ndarray[int16_t, ndim=2] values, - ndarray[int64_t] indexer, - int16_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int16_int16_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - int16_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - int16_t *v - int16_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int16_t) and - sizeof(int16_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int16_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int16_int32_memview(int16_t[:, :] values, - int64_t[:] indexer, - int32_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - int32_t *v - int32_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int32_t) and - sizeof(int32_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int32_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int16_int32(ndarray[int16_t, ndim=2] values, - ndarray[int64_t] indexer, - int32_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int16_int32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - int32_t *v - int32_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int32_t) and - sizeof(int32_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int32_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int16_int64_memview(int16_t[:, :] values, - int64_t[:] indexer, - int64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - int64_t *v - int64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int64_t) and - sizeof(int64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int16_int64(ndarray[int16_t, ndim=2] values, - ndarray[int64_t] indexer, - int64_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int16_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - int64_t *v - int64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int64_t) and - sizeof(int64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int16_float64_memview(int16_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int16_float64(ndarray[int16_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int16_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int32_int32_memview(int32_t[:, :] values, - int64_t[:] indexer, - int32_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - int32_t *v - int32_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int32_t) and - sizeof(int32_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int32_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int32_int32(ndarray[int32_t, ndim=2] values, - ndarray[int64_t] indexer, - int32_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int32_int32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - int32_t *v - int32_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int32_t) and - sizeof(int32_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int32_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int32_int64_memview(int32_t[:, :] values, - int64_t[:] indexer, - int64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - int64_t *v - int64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int64_t) and - sizeof(int64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int32_int64(ndarray[int32_t, ndim=2] values, - ndarray[int64_t] indexer, - int64_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int32_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - int64_t *v - int64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int64_t) and - sizeof(int64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int32_float64_memview(int32_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int32_float64(ndarray[int32_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int32_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int64_int64_memview(int64_t[:, :] values, - int64_t[:] indexer, - int64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - int64_t *v - int64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int64_t) and - sizeof(int64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int64_int64(ndarray[int64_t, ndim=2] values, - ndarray[int64_t] indexer, - int64_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int64_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - int64_t *v - int64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(int64_t) and - sizeof(int64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(int64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_int64_float64_memview(int64_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_int64_float64(ndarray[int64_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_int64_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_float32_float32_memview(float32_t[:, :] values, - int64_t[:] indexer, - float32_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float32_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - float32_t *v - float32_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float32_t) and - sizeof(float32_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float32_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_float32_float32(ndarray[float32_t, ndim=2] values, - ndarray[int64_t] indexer, - float32_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_float32_float32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - float32_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - float32_t *v - float32_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float32_t) and - sizeof(float32_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float32_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_float32_float64_memview(float32_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_float32_float64(ndarray[float32_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_float32_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_float64_float64_memview(float64_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_float64_float64(ndarray[float64_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_float64_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF True: - cdef: - float64_t *v - float64_t *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(float64_t) and - sizeof(float64_t) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(float64_t) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis0_object_object_memview(object[:, :] values, - int64_t[:] indexer, - object[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - object fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - object *v - object *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(object) and - sizeof(object) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(object) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis0_object_object(ndarray[object, ndim=2] values, - ndarray[int64_t] indexer, - object[:, :] out, - fill_value=np.nan): - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis0_object_object_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - object fv - - n = len(indexer) - k = values.shape[1] - - fv = fill_value - - IF False: - cdef: - object *v - object *o - - #GH3130 - if (values.strides[1] == out.strides[1] and - values.strides[1] == sizeof(object) and - sizeof(object) * n >= 256): - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - v = &values[idx, 0] - o = &out[i, 0] - memmove(o, v, (sizeof(object) * k)) - return - - for i from 0 <= i < n: - idx = indexer[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - out[i, j] = values[idx, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_bool_bool_memview(uint8_t[:, :] values, - int64_t[:] indexer, - uint8_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - uint8_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_bool_bool(ndarray[uint8_t, ndim=2] values, - ndarray[int64_t] indexer, - uint8_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_bool_bool_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - uint8_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_bool_object_memview(uint8_t[:, :] values, - int64_t[:] indexer, - object[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - object fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = True if values[i, idx] > 0 else False - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_bool_object(ndarray[uint8_t, ndim=2] values, - ndarray[int64_t] indexer, - object[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_bool_object_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - object fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = True if values[i, idx] > 0 else False - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int8_int8_memview(int8_t[:, :] values, - int64_t[:] indexer, - int8_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int8_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int8_int8(ndarray[int8_t, ndim=2] values, - ndarray[int64_t] indexer, - int8_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int8_int8_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - int8_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int8_int32_memview(int8_t[:, :] values, - int64_t[:] indexer, - int32_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int8_int32(ndarray[int8_t, ndim=2] values, - ndarray[int64_t] indexer, - int32_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int8_int32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int8_int64_memview(int8_t[:, :] values, - int64_t[:] indexer, - int64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int8_int64(ndarray[int8_t, ndim=2] values, - ndarray[int64_t] indexer, - int64_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int8_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int8_float64_memview(int8_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int8_float64(ndarray[int8_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int8_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int16_int16_memview(int16_t[:, :] values, - int64_t[:] indexer, - int16_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int16_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int16_int16(ndarray[int16_t, ndim=2] values, - ndarray[int64_t] indexer, - int16_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int16_int16_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - int16_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int16_int32_memview(int16_t[:, :] values, - int64_t[:] indexer, - int32_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int16_int32(ndarray[int16_t, ndim=2] values, - ndarray[int64_t] indexer, - int32_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int16_int32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int16_int64_memview(int16_t[:, :] values, - int64_t[:] indexer, - int64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int16_int64(ndarray[int16_t, ndim=2] values, - ndarray[int64_t] indexer, - int64_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int16_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int16_float64_memview(int16_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int16_float64(ndarray[int16_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int16_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int32_int32_memview(int32_t[:, :] values, - int64_t[:] indexer, - int32_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int32_int32(ndarray[int32_t, ndim=2] values, - ndarray[int64_t] indexer, - int32_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int32_int32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - int32_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int32_int64_memview(int32_t[:, :] values, - int64_t[:] indexer, - int64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int32_int64(ndarray[int32_t, ndim=2] values, - ndarray[int64_t] indexer, - int64_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int32_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int32_float64_memview(int32_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int32_float64(ndarray[int32_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int32_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int64_int64_memview(int64_t[:, :] values, - int64_t[:] indexer, - int64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int64_int64(ndarray[int64_t, ndim=2] values, - ndarray[int64_t] indexer, - int64_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int64_int64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - int64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_int64_float64_memview(int64_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_int64_float64(ndarray[int64_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_int64_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_float32_float32_memview(float32_t[:, :] values, - int64_t[:] indexer, - float32_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float32_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_float32_float32(ndarray[float32_t, ndim=2] values, - ndarray[int64_t] indexer, - float32_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_float32_float32_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
- cdef: - Py_ssize_t i, j, k, n, idx - float32_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_float32_float64_memview(float32_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_float32_float64(ndarray[float32_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_float32_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_float64_float64_memview(float64_t[:, :] values, - int64_t[:] indexer, - float64_t[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_float64_float64(ndarray[float64_t, ndim=2] values, - ndarray[int64_t] indexer, - float64_t[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_float64_float64_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. 
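(Editor's aside: the take_2d_multi_* kernels a little further down apply the same -1 → fill_value convention along both axes at once, taking a row indexer and a column indexer. A rough NumPy equivalent, again with illustrative names only:)

import numpy as np

def take_2d_multi_sketch(values, row_idx, col_idx, fill_value=np.nan):
    # row_idx / col_idx are 1-D position arrays where -1 means "no source row/column".
    out = np.empty((len(row_idx), len(col_idx)),
                   dtype=np.result_type(values, fill_value))
    rmiss, cmiss = row_idx == -1, col_idx == -1
    out[:] = values[np.ix_(np.where(rmiss, 0, row_idx),
                           np.where(cmiss, 0, col_idx))]  # gather rows x columns
    out[rmiss, :] = fill_value                            # rows with no source
    out[:, cmiss] = fill_value                            # columns with no source
    return out

print(take_2d_multi_sketch(np.arange(9.0).reshape(3, 3),
                           np.array([1, -1]), np.array([-1, 2])))
# [[nan  5.]
#  [nan nan]]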
- cdef: - Py_ssize_t i, j, k, n, idx - float64_t fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - -@cython.wraparound(False) -@cython.boundscheck(False) -cdef inline take_2d_axis1_object_object_memview(object[:, :] values, - int64_t[:] indexer, - object[:, :] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - object fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_axis1_object_object(ndarray[object, ndim=2] values, - ndarray[int64_t] indexer, - object[:, :] out, - fill_value=np.nan): - - if values.flags.writeable: - # We can call the memoryview version of the code - take_2d_axis1_object_object_memview(values, indexer, out, - fill_value=fill_value) - return - - # We cannot use the memoryview version on readonly-buffers due to - # a limitation of Cython's typed memoryviews. Instead we can use - # the slightly slower Cython ndarray type directly. - cdef: - Py_ssize_t i, j, k, n, idx - object fv - - n = len(values) - k = len(indexer) - - if n == 0 or k == 0: - return - - fv = fill_value - - for i from 0 <= i < n: - for j from 0 <= j < k: - idx = indexer[j] - if idx == -1: - out[i, j] = fv - else: - out[i, j] = values[i, idx] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_bool_bool(ndarray[uint8_t, ndim=2] values, - indexer, - ndarray[uint8_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - uint8_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_bool_object(ndarray[uint8_t, ndim=2] values, - indexer, - ndarray[object, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - object fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = True if values[idx, idx1[j]] > 0 else False - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int8_int8(ndarray[int8_t, ndim=2] values, - indexer, - ndarray[int8_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - int8_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int8_int32(ndarray[int8_t, ndim=2] values, - indexer, - ndarray[int32_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t 
i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - int32_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int8_int64(ndarray[int8_t, ndim=2] values, - indexer, - ndarray[int64_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - int64_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int8_float64(ndarray[int8_t, ndim=2] values, - indexer, - ndarray[float64_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - float64_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int16_int16(ndarray[int16_t, ndim=2] values, - indexer, - ndarray[int16_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - int16_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int16_int32(ndarray[int16_t, ndim=2] values, - indexer, - ndarray[int32_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - int32_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int16_int64(ndarray[int16_t, ndim=2] values, - indexer, - ndarray[int64_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - int64_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int16_float64(ndarray[int16_t, ndim=2] values, - indexer, - ndarray[float64_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - 
float64_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int32_int32(ndarray[int32_t, ndim=2] values, - indexer, - ndarray[int32_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - int32_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int32_int64(ndarray[int32_t, ndim=2] values, - indexer, - ndarray[int64_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - int64_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int32_float64(ndarray[int32_t, ndim=2] values, - indexer, - ndarray[float64_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - float64_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int64_int64(ndarray[int64_t, ndim=2] values, - indexer, - ndarray[int64_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - int64_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_int64_float64(ndarray[int64_t, ndim=2] values, - indexer, - ndarray[float64_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - float64_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_float32_float32(ndarray[float32_t, ndim=2] values, - indexer, - ndarray[float32_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - float32_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i 
from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_float32_float64(ndarray[float32_t, ndim=2] values, - indexer, - ndarray[float64_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - float64_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_float64_float64(ndarray[float64_t, ndim=2] values, - indexer, - ndarray[float64_t, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - float64_t fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - -@cython.wraparound(False) -@cython.boundscheck(False) -def take_2d_multi_object_object(ndarray[object, ndim=2] values, - indexer, - ndarray[object, ndim=2] out, - fill_value=np.nan): - cdef: - Py_ssize_t i, j, k, n, idx - ndarray[int64_t] idx0 = indexer[0] - ndarray[int64_t] idx1 = indexer[1] - object fv - - n = len(idx0) - k = len(idx1) - - fv = fill_value - for i from 0 <= i < n: - idx = idx0[i] - if idx == -1: - for j from 0 <= j < k: - out[i, j] = fv - else: - for j from 0 <= j < k: - if idx1[j] == -1: - out[i, j] = fv - else: - out[i, j] = values[idx, idx1[j]] - - -@cython.boundscheck(False) -@cython.wraparound(False) -def diff_2d_float64(ndarray[float64_t, ndim=2] arr, - ndarray[float64_t, ndim=2] out, - Py_ssize_t periods, int axis): - cdef: - Py_ssize_t i, j, sx, sy - - sx, sy = ( arr).shape - if arr.flags.f_contiguous: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for j in range(sy): - for i in range(start, stop): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for j in range(start, stop): - for i in range(sx): - out[i, j] = arr[i, j] - arr[i, j - periods] - else: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for i in range(start, stop): - for j in range(sy): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for i in range(sx): - for j in range(start, stop): - out[i, j] = arr[i, j] - arr[i, j - periods] - -@cython.boundscheck(False) -@cython.wraparound(False) -def diff_2d_float32(ndarray[float32_t, ndim=2] arr, - ndarray[float32_t, ndim=2] out, - Py_ssize_t periods, int axis): - cdef: - Py_ssize_t i, j, sx, sy - - sx, sy = ( arr).shape - if arr.flags.f_contiguous: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for j in range(sy): - for i in range(start, stop): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - 
start, stop = 0, sy + periods - for j in range(start, stop): - for i in range(sx): - out[i, j] = arr[i, j] - arr[i, j - periods] - else: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for i in range(start, stop): - for j in range(sy): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for i in range(sx): - for j in range(start, stop): - out[i, j] = arr[i, j] - arr[i, j - periods] - -@cython.boundscheck(False) -@cython.wraparound(False) -def diff_2d_int8(ndarray[int8_t, ndim=2] arr, - ndarray[float32_t, ndim=2] out, - Py_ssize_t periods, int axis): - cdef: - Py_ssize_t i, j, sx, sy - - sx, sy = ( arr).shape - if arr.flags.f_contiguous: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for j in range(sy): - for i in range(start, stop): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for j in range(start, stop): - for i in range(sx): - out[i, j] = arr[i, j] - arr[i, j - periods] - else: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for i in range(start, stop): - for j in range(sy): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for i in range(sx): - for j in range(start, stop): - out[i, j] = arr[i, j] - arr[i, j - periods] - -@cython.boundscheck(False) -@cython.wraparound(False) -def diff_2d_int16(ndarray[int16_t, ndim=2] arr, - ndarray[float32_t, ndim=2] out, - Py_ssize_t periods, int axis): - cdef: - Py_ssize_t i, j, sx, sy - - sx, sy = ( arr).shape - if arr.flags.f_contiguous: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for j in range(sy): - for i in range(start, stop): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for j in range(start, stop): - for i in range(sx): - out[i, j] = arr[i, j] - arr[i, j - periods] - else: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for i in range(start, stop): - for j in range(sy): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for i in range(sx): - for j in range(start, stop): - out[i, j] = arr[i, j] - arr[i, j - periods] - -@cython.boundscheck(False) -@cython.wraparound(False) -def diff_2d_int32(ndarray[int32_t, ndim=2] arr, - ndarray[float64_t, ndim=2] out, - Py_ssize_t periods, int axis): - cdef: - Py_ssize_t i, j, sx, sy - - sx, sy = ( arr).shape - if arr.flags.f_contiguous: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for j in range(sy): - for i in range(start, stop): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for j in range(start, stop): - for i in range(sx): - out[i, j] = arr[i, j] - arr[i, j - periods] - else: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for i in range(start, stop): - for j in range(sy): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - 
start, stop = periods, sy - else: - start, stop = 0, sy + periods - for i in range(sx): - for j in range(start, stop): - out[i, j] = arr[i, j] - arr[i, j - periods] - -@cython.boundscheck(False) -@cython.wraparound(False) -def diff_2d_int64(ndarray[int64_t, ndim=2] arr, - ndarray[float64_t, ndim=2] out, - Py_ssize_t periods, int axis): - cdef: - Py_ssize_t i, j, sx, sy - - sx, sy = ( arr).shape - if arr.flags.f_contiguous: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for j in range(sy): - for i in range(start, stop): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for j in range(start, stop): - for i in range(sx): - out[i, j] = arr[i, j] - arr[i, j - periods] - else: - if axis == 0: - if periods >= 0: - start, stop = periods, sx - else: - start, stop = 0, sx + periods - for i in range(start, stop): - for j in range(sy): - out[i, j] = arr[i, j] - arr[i - periods, j] - else: - if periods >= 0: - start, stop = periods, sy - else: - start, stop = 0, sy + periods - for i in range(sx): - for j in range(start, stop): - out[i, j] = arr[i, j] - arr[i, j - periods] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_add_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float64_t val, count - ndarray[float64_t, ndim=2] sumx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - sumx = np.zeros_like(out) - - N, K = ( values).shape - - - with nogil: - - if K > 1: - - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - sumx[lab, j] += val - - else: - - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val: - nobs[lab, 0] += 1 - sumx[lab, 0] += val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = sumx[i, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_add_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float32_t val, count - ndarray[float32_t, ndim=2] sumx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - sumx = np.zeros_like(out) - - N, K = ( values).shape - - - with nogil: - - if K > 1: - - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - sumx[lab, j] += val - - else: - - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val: - nobs[lab, 0] += 1 - sumx[lab, 0] += val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = sumx[i, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_prod_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - 
ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float64_t val, count - ndarray[float64_t, ndim=2] prodx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - prodx = np.ones_like(out) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - prodx[lab, j] *= val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val: - nobs[lab, 0] += 1 - prodx[lab, 0] *= val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = prodx[i, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_prod_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float32_t val, count - ndarray[float32_t, ndim=2] prodx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - prodx = np.ones_like(out) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - prodx[lab, j] *= val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val: - nobs[lab, 0] += 1 - prodx[lab, 0] *= val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = prodx[i, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -@cython.cdivision(True) -def group_var_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float64_t val, ct, oldmean - ndarray[float64_t, ndim=2] nobs, mean - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - mean = np.zeros_like(out) - - N, K = ( values).shape - - out[:, :] = 0.0 - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - oldmean = mean[lab, j] - mean[lab, j] += (val - oldmean) / nobs[lab, j] - out[lab, j] += (val - mean[lab, j]) * (val - oldmean) - - for i in range(ncounts): - for j in range(K): - ct = nobs[i, j] - if ct < 2: - out[i, j] = NAN - else: - out[i, j] /= (ct - 1) - - -@cython.wraparound(False) -@cython.boundscheck(False) -@cython.cdivision(True) -def group_var_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] labels): - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float32_t val, ct, oldmean - ndarray[float32_t, ndim=2] nobs, mean - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - mean = np.zeros_like(out) - 
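# Editor's aside (an illustrative Python sketch, not part of the removed
# generated.pyx): the group_var_* kernels in this hunk use Welford's streaming
# update -- each observation advances the group mean in place and accumulates
# the sum of squared deviations, which is divided by (count - 1) at the end,
# with NaN for groups holding fewer than two observations.
import numpy as np

def group_var_sketch(values, labels, ngroups):
    nobs = np.zeros(ngroups)
    mean = np.zeros(ngroups)
    m2 = np.zeros(ngroups)                 # running sum of squared deviations
    for val, lab in zip(values, labels):
        if lab < 0 or val != val:          # skip unlabelled rows and NaNs
            continue
        nobs[lab] += 1
        old = mean[lab]
        mean[lab] += (val - old) / nobs[lab]
        m2[lab] += (val - mean[lab]) * (val - old)
    return np.where(nobs < 2, np.nan, m2 / np.maximum(nobs - 1, 1))

print(group_var_sketch(np.array([1.0, 2.0, 3.0, np.nan]),
                       np.array([0, 0, 0, 1]), 2))   # -> [ 1. nan]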
- N, K = ( values).shape - - out[:, :] = 0.0 - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - - for j in range(K): - val = values[i, j] - - # not nan - if val == val: - nobs[lab, j] += 1 - oldmean = mean[lab, j] - mean[lab, j] += (val - oldmean) / nobs[lab, j] - out[lab, j] += (val - mean[lab, j]) * (val - oldmean) - - for i in range(ncounts): - for j in range(K): - ct = nobs[i, j] - if ct < 2: - out[i, j] = NAN - else: - out[i, j] /= (ct - 1) - - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_mean_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float64_t val, count - ndarray[float64_t, ndim=2] sumx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - sumx = np.zeros_like(out) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - # not nan - if val == val: - nobs[lab, j] += 1 - sumx[lab, j] += val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - # not nan - if val == val: - nobs[lab, 0] += 1 - sumx[lab, 0] += val - - for i in range(ncounts): - for j in range(K): - count = nobs[i, j] - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = sumx[i, j] / count - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_mean_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] labels): - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float32_t val, count - ndarray[float32_t, ndim=2] sumx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - sumx = np.zeros_like(out) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - # not nan - if val == val: - nobs[lab, j] += 1 - sumx[lab, j] += val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - # not nan - if val == val: - nobs[lab, 0] += 1 - sumx[lab, 0] += val - - for i in range(ncounts): - for j in range(K): - count = nobs[i, j] - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = sumx[i, j] / count - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_ohlc_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab - float64_t val, count - Py_ssize_t ngroups = len(counts) - - if len(labels) == 0: - return - - N, K = ( values).shape - - if out.shape[1] != 4: - raise ValueError('Output array must have 4 columns') - - if K > 1: - raise NotImplementedError("Argument 'values' must have only " - "one dimension") - out.fill(np.nan) - - with nogil: - for i in range(N): - lab = labels[i] - if lab == -1: - continue - - counts[lab] += 1 - val = values[i, 0] - if val != val: - continue - - if out[lab, 0] != out[lab, 0]: - out[lab, 0] = out[lab, 1] = out[lab, 2] = out[lab, 3] = val - else: - out[lab, 1] = max(out[lab, 1], val) - out[lab, 2] = 
min(out[lab, 2], val) - out[lab, 3] = val - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_ohlc_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab - float32_t val, count - Py_ssize_t ngroups = len(counts) - - if len(labels) == 0: - return - - N, K = ( values).shape - - if out.shape[1] != 4: - raise ValueError('Output array must have 4 columns') - - if K > 1: - raise NotImplementedError("Argument 'values' must have only " - "one dimension") - out.fill(np.nan) - - with nogil: - for i in range(N): - lab = labels[i] - if lab == -1: - continue - - counts[lab] += 1 - val = values[i, 0] - if val != val: - continue - - if out[lab, 0] != out[lab, 0]: - out[lab, 0] = out[lab, 1] = out[lab, 2] = out[lab, 3] = val - else: - out[lab, 1] = max(out[lab, 1], val) - out[lab, 2] = min(out[lab, 2], val) - out[lab, 3] = val - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_last_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float64_t val, count - ndarray[float64_t, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros(( out).shape, dtype=np.int64) - resx = np.empty_like(out) - - N, K = ( values).shape - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != NAN: - nobs[lab, j] += 1 - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = resx[i, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_last_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float32_t val, count - ndarray[float32_t, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros(( out).shape, dtype=np.int64) - resx = np.empty_like(out) - - N, K = ( values).shape - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != NAN: - nobs[lab, j] += 1 - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = resx[i, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_last_int64(ndarray[int64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[int64_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - int64_t val, count - ndarray[int64_t, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros(( out).shape, dtype=np.int64) - resx = np.empty_like(out) - - N, K = ( values).shape - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - 
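# Editor's aside (illustrative sketch with hypothetical names): group_last_* /
# group_nth_* keep, per group, the last (or rank-th) non-missing value.  In the
# int64 kernels "missing" is pandas' iNaT sentinel (the minimum int64) rather
# than NaN, and empty groups are reported back as iNaT as well.  Roughly:
import numpy as np
INAT = np.iinfo(np.int64).min              # stand-in for the iNaT sentinel

def group_last_sketch(values, labels, ngroups):
    out = np.full(ngroups, INAT, dtype=np.int64)
    for val, lab in zip(values, labels):
        if lab >= 0 and val != INAT:
            out[lab] = val                 # later rows simply overwrite earlier ones
    return out

print(group_last_sketch(np.array([5, INAT, 7], dtype=np.int64),
                        np.array([0, 1, 0]), 2))
# group 0 -> 7 (last non-missing value), group 1 (all missing) -> INAT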
counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != iNaT: - nobs[lab, j] += 1 - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = iNaT - else: - out[i, j] = resx[i, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_nth_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels, int64_t rank): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float64_t val, count - ndarray[float64_t, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros(( out).shape, dtype=np.int64) - resx = np.empty_like(out) - - N, K = ( values).shape - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != NAN: - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = resx[i, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_nth_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] labels, int64_t rank): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float32_t val, count - ndarray[float32_t, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros(( out).shape, dtype=np.int64) - resx = np.empty_like(out) - - N, K = ( values).shape - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != NAN: - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = resx[i, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_nth_int64(ndarray[int64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[int64_t, ndim=2] values, - ndarray[int64_t] labels, int64_t rank): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - int64_t val, count - ndarray[int64_t, ndim=2] resx - ndarray[int64_t, ndim=2] nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros(( out).shape, dtype=np.int64) - resx = np.empty_like(out) - - N, K = ( values).shape - - with nogil: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != iNaT: - nobs[lab, j] += 1 - if nobs[lab, j] == rank: - resx[lab, j] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = iNaT - else: - out[i, j] = resx[i, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_min_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float64_t val, count - 
ndarray[float64_t, ndim=2] minx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - - minx = np.empty_like(out) - minx.fill(np.inf) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != NAN: - - nobs[lab, j] += 1 - if val < minx[lab, j]: - minx[lab, j] = val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val and val != NAN: - nobs[lab, 0] += 1 - if val < minx[lab, 0]: - minx[lab, 0] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = minx[i, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_min_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float32_t val, count - ndarray[float32_t, ndim=2] minx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - - minx = np.empty_like(out) - minx.fill(np.inf) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != NAN: - - nobs[lab, j] += 1 - if val < minx[lab, j]: - minx[lab, j] = val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val and val != NAN: - nobs[lab, 0] += 1 - if val < minx[lab, 0]: - minx[lab, 0] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = minx[i, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_min_int64(ndarray[int64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[int64_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - int64_t val, count - ndarray[int64_t, ndim=2] minx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - - minx = np.empty_like(out) - minx.fill(9223372036854775807) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != iNaT: - - nobs[lab, j] += 1 - if val < minx[lab, j]: - minx[lab, j] = val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val and val != iNaT: - nobs[lab, 0] += 1 - if val < minx[lab, 0]: - minx[lab, 0] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = iNaT - else: - out[i, j] = minx[i, j] - - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_max_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float64_t val, count 
- ndarray[float64_t, ndim=2] maxx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - - maxx = np.empty_like(out) - maxx.fill(-np.inf) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != NAN: - nobs[lab, j] += 1 - if val > maxx[lab, j]: - maxx[lab, j] = val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val and val != NAN: - nobs[lab, 0] += 1 - if val > maxx[lab, 0]: - maxx[lab, 0] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = maxx[i, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_max_float32(ndarray[float32_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float32_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - float32_t val, count - ndarray[float32_t, ndim=2] maxx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - - maxx = np.empty_like(out) - maxx.fill(-np.inf) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != NAN: - nobs[lab, j] += 1 - if val > maxx[lab, j]: - maxx[lab, j] = val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val and val != NAN: - nobs[lab, 0] += 1 - if val > maxx[lab, 0]: - maxx[lab, 0] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = NAN - else: - out[i, j] = maxx[i, j] - -@cython.wraparound(False) -@cython.boundscheck(False) -def group_max_int64(ndarray[int64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[int64_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, lab, ncounts = len(counts) - int64_t val, count - ndarray[int64_t, ndim=2] maxx, nobs - - if not len(values) == len(labels): - raise AssertionError("len(index) != len(labels)") - - nobs = np.zeros_like(out) - - maxx = np.empty_like(out) - maxx.fill(-9223372036854775807) - - N, K = ( values).shape - - with nogil: - if K > 1: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - for j in range(K): - val = values[i, j] - - # not nan - if val == val and val != iNaT: - nobs[lab, j] += 1 - if val > maxx[lab, j]: - maxx[lab, j] = val - else: - for i in range(N): - lab = labels[i] - if lab < 0: - continue - - counts[lab] += 1 - val = values[i, 0] - - # not nan - if val == val and val != iNaT: - nobs[lab, 0] += 1 - if val > maxx[lab, 0]: - maxx[lab, 0] = val - - for i in range(ncounts): - for j in range(K): - if nobs[i, j] == 0: - out[i, j] = iNaT - else: - out[i, j] = maxx[i, j] - - -def group_median_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[int64_t] labels): - """ - Only aggregates on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, ngroups, size - ndarray[int64_t] _counts - ndarray data - float64_t* ptr - ngroups = len(counts) - N, 
K = ( values).shape - - indexer, _counts = groupsort_indexer(labels, ngroups) - counts[:] = _counts[1:] - - data = np.empty((K, N), dtype=np.float64) - ptr = data.data - - take_2d_axis1_float64_float64(values.T, indexer, out=data) - - for i in range(K): - # exclude NA group - ptr += _counts[0] - for j in range(ngroups): - size = _counts[j + 1] - out[j, i] = _median_linear(ptr, size) - ptr += size - - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_cumprod_float64(float64_t[:,:] out, - float64_t[:,:] values, - int64_t[:] labels, - float64_t[:,:] accum): - """ - Only transforms on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, size - float64_t val - int64_t lab - - N, K = ( values).shape - accum = np.ones_like(accum) - - with nogil: - for i in range(N): - lab = labels[i] - - if lab < 0: - continue - for j in range(K): - val = values[i, j] - if val == val: - accum[lab, j] *= val - out[i, j] = accum[lab, j] - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_cumsum(numeric[:,:] out, - numeric[:,:] values, - int64_t[:] labels, - numeric[:,:] accum): - """ - Only transforms on axis=0 - """ - cdef: - Py_ssize_t i, j, N, K, size - numeric val - int64_t lab - - N, K = ( values).shape - accum = np.zeros_like(accum) - - with nogil: - for i in range(N): - lab = labels[i] - - if lab < 0: - continue - for j in range(K): - val = values[i,j] - if val == val: - accum[lab,j] += val - out[i,j] = accum[lab,j] - -@cython.boundscheck(False) -@cython.wraparound(False) -def group_shift_indexer(int64_t[:] out, int64_t[:] labels, - int ngroups, int periods): - cdef: - Py_ssize_t N, i, j, ii - int offset, sign - int64_t lab, idxer, idxer_slot - int64_t[:] label_seen = np.zeros(ngroups, dtype=np.int64) - int64_t[:,:] label_indexer - - N, = ( labels).shape - - if periods < 0: - periods = -periods - offset = N - 1 - sign = -1 - elif periods > 0: - offset = 0 - sign = 1 - - if periods == 0: - with nogil: - for i in range(N): - out[i] = i - else: - # array of each previous indexer seen - label_indexer = np.zeros((ngroups, periods), dtype=np.int64) - with nogil: - for i in range(N): - ## reverse iterator if shifting backwards - ii = offset + sign * i - lab = labels[ii] - label_seen[lab] += 1 - - idxer_slot = label_seen[lab] % periods - idxer = label_indexer[lab, idxer_slot] - - if label_seen[lab] > periods: - out[ii] = idxer - else: - out[ii] = -1 - - label_indexer[lab, idxer_slot] = ii - -@cython.wraparound(False) -@cython.boundscheck(False) -def left_join_indexer_unique_float64(ndarray[float64_t] left, - ndarray[float64_t] right): - cdef: - Py_ssize_t i, j, nleft, nright - ndarray[int64_t] indexer - float64_t lval, rval - - i = 0 - j = 0 - nleft = len(left) - nright = len(right) - - indexer = np.empty(nleft, dtype=np.int64) - while True: - if i == nleft: - break - - if j == nright: - indexer[i] = -1 - i += 1 - continue - - rval = right[j] - - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - - if left[i] == right[j]: - indexer[i] = j - i += 1 - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - j += 1 - elif left[i] > rval: - indexer[i] = -1 - j += 1 - else: - indexer[i] = -1 - i += 1 - return indexer - -@cython.wraparound(False) -@cython.boundscheck(False) -def left_join_indexer_unique_float32(ndarray[float32_t] left, - ndarray[float32_t] right): - cdef: - Py_ssize_t i, j, nleft, nright - ndarray[int64_t] indexer - float32_t lval, rval - - i = 0 - j = 0 - nleft = len(left) - nright = len(right) - - indexer = np.empty(nleft, dtype=np.int64) - while 
True: - if i == nleft: - break - - if j == nright: - indexer[i] = -1 - i += 1 - continue - - rval = right[j] - - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - - if left[i] == right[j]: - indexer[i] = j - i += 1 - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - j += 1 - elif left[i] > rval: - indexer[i] = -1 - j += 1 - else: - indexer[i] = -1 - i += 1 - return indexer - -@cython.wraparound(False) -@cython.boundscheck(False) -def left_join_indexer_unique_object(ndarray[object] left, - ndarray[object] right): - cdef: - Py_ssize_t i, j, nleft, nright - ndarray[int64_t] indexer - object lval, rval - - i = 0 - j = 0 - nleft = len(left) - nright = len(right) - - indexer = np.empty(nleft, dtype=np.int64) - while True: - if i == nleft: - break - - if j == nright: - indexer[i] = -1 - i += 1 - continue - - rval = right[j] - - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - - if left[i] == right[j]: - indexer[i] = j - i += 1 - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - j += 1 - elif left[i] > rval: - indexer[i] = -1 - j += 1 - else: - indexer[i] = -1 - i += 1 - return indexer - -@cython.wraparound(False) -@cython.boundscheck(False) -def left_join_indexer_unique_int32(ndarray[int32_t] left, - ndarray[int32_t] right): - cdef: - Py_ssize_t i, j, nleft, nright - ndarray[int64_t] indexer - int32_t lval, rval - - i = 0 - j = 0 - nleft = len(left) - nright = len(right) - - indexer = np.empty(nleft, dtype=np.int64) - while True: - if i == nleft: - break - - if j == nright: - indexer[i] = -1 - i += 1 - continue - - rval = right[j] - - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - - if left[i] == right[j]: - indexer[i] = j - i += 1 - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - j += 1 - elif left[i] > rval: - indexer[i] = -1 - j += 1 - else: - indexer[i] = -1 - i += 1 - return indexer - -@cython.wraparound(False) -@cython.boundscheck(False) -def left_join_indexer_unique_int64(ndarray[int64_t] left, - ndarray[int64_t] right): - cdef: - Py_ssize_t i, j, nleft, nright - ndarray[int64_t] indexer - int64_t lval, rval - - i = 0 - j = 0 - nleft = len(left) - nright = len(right) - - indexer = np.empty(nleft, dtype=np.int64) - while True: - if i == nleft: - break - - if j == nright: - indexer[i] = -1 - i += 1 - continue - - rval = right[j] - - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - - if left[i] == right[j]: - indexer[i] = j - i += 1 - while i < nleft - 1 and left[i] == rval: - indexer[i] = j - i += 1 - j += 1 - elif left[i] > rval: - indexer[i] = -1 - j += 1 - else: - indexer[i] = -1 - i += 1 - return indexer - - -def left_join_indexer_float64(ndarray[float64_t] left, - ndarray[float64_t] right): - """ - Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - float64_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[float64_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.float64) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - i += 1 - count += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - else: - j += 1 - - return result, lindexer, rindexer - -def left_join_indexer_float32(ndarray[float32_t] left, - ndarray[float32_t] right): - """ - Two-pass algorithm for monotonic indexes. Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - float32_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[float32_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.float32) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - i += 1 - count += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - else: - j += 1 - - return result, lindexer, rindexer - -def left_join_indexer_object(ndarray[object] left, - ndarray[object] right): - """ - Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - object lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[object] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=object) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - i += 1 - count += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - else: - j += 1 - - return result, lindexer, rindexer - -def left_join_indexer_int32(ndarray[int32_t] left, - ndarray[int32_t] right): - """ - Two-pass algorithm for monotonic indexes. Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - int32_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[int32_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.int32) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - i += 1 - count += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - else: - j += 1 - - return result, lindexer, rindexer - -def left_join_indexer_int64(ndarray[int64_t] left, - ndarray[int64_t] right): - """ - Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - int64_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[int64_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.int64) - - i = 0 - j = 0 - count = 0 - if nleft > 0: - while i < nleft: - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - i += 1 - count += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - else: - j += 1 - - return result, lindexer, rindexer - - -@cython.wraparound(False) -@cython.boundscheck(False) -def outer_join_indexer_float64(ndarray[float64_t] left, - ndarray[float64_t] right): - cdef: - Py_ssize_t i, j, nright, nleft, count - float64_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[float64_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft == 0: - count = nright - elif nright == 0: - count = nleft - else: - while True: - if i == nleft: - count += nright - j - break - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - count += 1 - j += 1 - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.float64) - - # do it again, but populate the indexers / result - - i = 0 - j = 0 - count = 0 - if nleft == 0: - for j in range(nright): - lindexer[j] = -1 - rindexer[j] = j - result[j] = right[j] - elif nright == 0: - for i in range(nleft): - lindexer[i] = i - rindexer[i] = -1 - result[i] = left[i] - else: - while True: - if i == nleft: - while j < nright: - lindexer[count] = -1 - rindexer[count] = j - result[count] = right[j] - count += 1 - j += 1 - break - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - 
else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = lval - count += 1 - i += 1 - else: - lindexer[count] = -1 - rindexer[count] = j - result[count] = rval - count += 1 - j += 1 - - return result, lindexer, rindexer - -@cython.wraparound(False) -@cython.boundscheck(False) -def outer_join_indexer_float32(ndarray[float32_t] left, - ndarray[float32_t] right): - cdef: - Py_ssize_t i, j, nright, nleft, count - float32_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[float32_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft == 0: - count = nright - elif nright == 0: - count = nleft - else: - while True: - if i == nleft: - count += nright - j - break - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - count += 1 - j += 1 - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.float32) - - # do it again, but populate the indexers / result - - i = 0 - j = 0 - count = 0 - if nleft == 0: - for j in range(nright): - lindexer[j] = -1 - rindexer[j] = j - result[j] = right[j] - elif nright == 0: - for i in range(nleft): - lindexer[i] = i - rindexer[i] = -1 - result[i] = left[i] - else: - while True: - if i == nleft: - while j < nright: - lindexer[count] = -1 - rindexer[count] = j - result[count] = right[j] - count += 1 - j += 1 - break - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = lval - count += 1 - i += 1 - else: - lindexer[count] = -1 - rindexer[count] = j - result[count] = rval - count += 1 - j += 1 - - return result, lindexer, rindexer - -@cython.wraparound(False) -@cython.boundscheck(False) -def outer_join_indexer_object(ndarray[object] left, - ndarray[object] right): - cdef: - Py_ssize_t i, j, nright, nleft, count - object lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[object] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft == 0: - count = nright - elif nright == 0: - count = nleft - else: - while True: - if i == nleft: - count += nright - j - break - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - count += 1 - j += 1 - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=object) - - # do it 
again, but populate the indexers / result - - i = 0 - j = 0 - count = 0 - if nleft == 0: - for j in range(nright): - lindexer[j] = -1 - rindexer[j] = j - result[j] = right[j] - elif nright == 0: - for i in range(nleft): - lindexer[i] = i - rindexer[i] = -1 - result[i] = left[i] - else: - while True: - if i == nleft: - while j < nright: - lindexer[count] = -1 - rindexer[count] = j - result[count] = right[j] - count += 1 - j += 1 - break - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = lval - count += 1 - i += 1 - else: - lindexer[count] = -1 - rindexer[count] = j - result[count] = rval - count += 1 - j += 1 - - return result, lindexer, rindexer - -@cython.wraparound(False) -@cython.boundscheck(False) -def outer_join_indexer_int32(ndarray[int32_t] left, - ndarray[int32_t] right): - cdef: - Py_ssize_t i, j, nright, nleft, count - int32_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[int32_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft == 0: - count = nright - elif nright == 0: - count = nleft - else: - while True: - if i == nleft: - count += nright - j - break - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - count += 1 - j += 1 - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.int32) - - # do it again, but populate the indexers / result - - i = 0 - j = 0 - count = 0 - if nleft == 0: - for j in range(nright): - lindexer[j] = -1 - rindexer[j] = j - result[j] = right[j] - elif nright == 0: - for i in range(nleft): - lindexer[i] = i - rindexer[i] = -1 - result[i] = left[i] - else: - while True: - if i == nleft: - while j < nright: - lindexer[count] = -1 - rindexer[count] = j - result[count] = right[j] - count += 1 - j += 1 - break - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = lval - count += 1 - i += 1 - else: - lindexer[count] = -1 - rindexer[count] = j - result[count] = rval - count += 1 - j += 1 - - return result, lindexer, rindexer - -@cython.wraparound(False) -@cython.boundscheck(False) -def outer_join_indexer_int64(ndarray[int64_t] left, - ndarray[int64_t] right): - 
cdef: - Py_ssize_t i, j, nright, nleft, count - int64_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[int64_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft == 0: - count = nright - elif nright == 0: - count = nleft - else: - while True: - if i == nleft: - count += nright - j - break - if j == nright: - count += nleft - i - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - count += 1 - i += 1 - else: - count += 1 - j += 1 - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.int64) - - # do it again, but populate the indexers / result - - i = 0 - j = 0 - count = 0 - if nleft == 0: - for j in range(nright): - lindexer[j] = -1 - rindexer[j] = j - result[j] = right[j] - elif nright == 0: - for i in range(nleft): - lindexer[i] = i - rindexer[i] = -1 - result[i] = left[i] - else: - while True: - if i == nleft: - while j < nright: - lindexer[count] = -1 - rindexer[count] = j - result[count] = right[j] - count += 1 - j += 1 - break - if j == nright: - while i < nleft: - lindexer[count] = i - rindexer[count] = -1 - result[count] = left[i] - count += 1 - i += 1 - break - - lval = left[i] - rval = right[j] - - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = lval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - lindexer[count] = i - rindexer[count] = -1 - result[count] = lval - count += 1 - i += 1 - else: - lindexer[count] = -1 - rindexer[count] = j - result[count] = rval - count += 1 - j += 1 - - return result, lindexer, rindexer - - -@cython.wraparound(False) -@cython.boundscheck(False) -def inner_join_indexer_float64(ndarray[float64_t] left, - ndarray[float64_t] right): - """ - Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - float64_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[float64_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.float64) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = rval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - return result, lindexer, rindexer - -@cython.wraparound(False) -@cython.boundscheck(False) -def inner_join_indexer_float32(ndarray[float32_t] left, - ndarray[float32_t] right): - """ - Two-pass algorithm for monotonic indexes. Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - float32_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[float32_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.float32) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = rval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - return result, lindexer, rindexer - -@cython.wraparound(False) -@cython.boundscheck(False) -def inner_join_indexer_object(ndarray[object] left, - ndarray[object] right): - """ - Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - object lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[object] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=object) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = rval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - return result, lindexer, rindexer - -@cython.wraparound(False) -@cython.boundscheck(False) -def inner_join_indexer_int32(ndarray[int32_t] left, - ndarray[int32_t] right): - """ - Two-pass algorithm for monotonic indexes. Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - int32_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[int32_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.int32) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = rval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - return result, lindexer, rindexer - -@cython.wraparound(False) -@cython.boundscheck(False) -def inner_join_indexer_int64(ndarray[int64_t] left, - ndarray[int64_t] right): - """ - Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges - """ - cdef: - Py_ssize_t i, j, k, nright, nleft, count - int64_t lval, rval - ndarray[int64_t] lindexer, rindexer - ndarray[int64_t] result - - nleft = len(left) - nright = len(right) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - # do it again now that result size is known - - lindexer = np.empty(count, dtype=np.int64) - rindexer = np.empty(count, dtype=np.int64) - result = np.empty(count, dtype=np.int64) - - i = 0 - j = 0 - count = 0 - if nleft > 0 and nright > 0: - while True: - if i == nleft: - break - if j == nright: - break - - lval = left[i] - rval = right[j] - if lval == rval: - lindexer[count] = i - rindexer[count] = j - result[count] = rval - count += 1 - if i < nleft - 1: - if j < nright - 1 and right[j + 1] == rval: - j += 1 - else: - i += 1 - if left[i] != rval: - j += 1 - elif j < nright - 1: - j += 1 - if lval != right[j]: - i += 1 - else: - # end of the road - break - elif lval < rval: - i += 1 - else: - j += 1 - - return result, lindexer, rindexer - - diff --git a/pandas/src/hashtable_class_helper.pxi b/pandas/src/hashtable_class_helper.pxi new file mode 100644 index 0000000000000..da0c76aeca86f --- /dev/null +++ b/pandas/src/hashtable_class_helper.pxi @@ -0,0 +1,860 @@ +""" +Template for each `dtype` helper function for hashtable + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +#---------------------------------------------------------------------- +# VectorData +#---------------------------------------------------------------------- + + +ctypedef struct Float64VectorData: + float64_t *data + size_t n, m + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef void append_data_float64(Float64VectorData *data, + float64_t x) nogil: + + data.data[data.n] = x + data.n += 1 + + +ctypedef struct Int64VectorData: + int64_t *data + size_t n, m + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef void append_data_int64(Int64VectorData *data, + int64_t x) nogil: + + data.data[data.n] = x + data.n += 1 + +ctypedef fused vector_data: + Int64VectorData + Float64VectorData + +cdef bint needs_resize(vector_data *data) nogil: + return data.n == data.m + +#---------------------------------------------------------------------- +# Vector +#---------------------------------------------------------------------- + +cdef class Float64Vector: + + cdef: + Float64VectorData *data + ndarray ao + + def __cinit__(self): + self.data = PyMem_Malloc( + sizeof(Float64VectorData)) + if not self.data: + raise MemoryError() + self.data.n = 0 + self.data.m = _INIT_VEC_CAP + self.ao = np.empty(self.data.m, dtype=np.float64) + self.data.data = self.ao.data + + cdef resize(self): + self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) + self.ao.resize(self.data.m) + self.data.data = self.ao.data + + def __dealloc__(self): + PyMem_Free(self.data) + + def __len__(self): + return self.data.n + + def to_array(self): + self.ao.resize(self.data.n) + self.data.m = self.data.n + return self.ao + + cdef inline void append(self, float64_t x): + + if needs_resize(self.data): + self.resize() + + append_data_float64(self.data, x) + 
+cdef class Int64Vector: + + cdef: + Int64VectorData *data + ndarray ao + + def __cinit__(self): + self.data = PyMem_Malloc( + sizeof(Int64VectorData)) + if not self.data: + raise MemoryError() + self.data.n = 0 + self.data.m = _INIT_VEC_CAP + self.ao = np.empty(self.data.m, dtype=np.int64) + self.data.data = self.ao.data + + cdef resize(self): + self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) + self.ao.resize(self.data.m) + self.data.data = self.ao.data + + def __dealloc__(self): + PyMem_Free(self.data) + + def __len__(self): + return self.data.n + + def to_array(self): + self.ao.resize(self.data.n) + self.data.m = self.data.n + return self.ao + + cdef inline void append(self, int64_t x): + + if needs_resize(self.data): + self.resize() + + append_data_int64(self.data, x) + + +cdef class ObjectVector: + + cdef: + PyObject **data + size_t n, m + ndarray ao + + def __cinit__(self): + self.n = 0 + self.m = _INIT_VEC_CAP + self.ao = np.empty(_INIT_VEC_CAP, dtype=object) + self.data = self.ao.data + + def __len__(self): + return self.n + + cdef inline append(self, object o): + if self.n == self.m: + self.m = max(self.m * 2, _INIT_VEC_CAP) + self.ao.resize(self.m) + self.data = self.ao.data + + Py_INCREF(o) + self.data[self.n] = o + self.n += 1 + + def to_array(self): + self.ao.resize(self.n) + self.m = self.n + return self.ao + + +#---------------------------------------------------------------------- +# HashTable +#---------------------------------------------------------------------- + + +cdef class HashTable: + pass + +cdef class Float64HashTable(HashTable): + + def __cinit__(self, size_hint=1): + self.table = kh_init_float64() + if size_hint is not None: + kh_resize_float64(self.table, size_hint) + + def __len__(self): + return self.table.size + + def __dealloc__(self): + kh_destroy_float64(self.table) + + def __contains__(self, object key): + cdef khiter_t k + k = kh_get_float64(self.table, key) + return k != self.table.n_buckets + + cpdef get_item(self, float64_t val): + cdef khiter_t k + k = kh_get_float64(self.table, val) + if k != self.table.n_buckets: + return self.table.vals[k] + else: + raise KeyError(val) + + def get_iter_test(self, float64_t key, Py_ssize_t iterations): + cdef Py_ssize_t i, val=0 + for i in range(iterations): + k = kh_get_float64(self.table, val) + if k != self.table.n_buckets: + val = self.table.vals[k] + + cpdef set_item(self, float64_t key, Py_ssize_t val): + cdef: + khiter_t k + int ret = 0 + + k = kh_put_float64(self.table, key, &ret) + self.table.keys[k] = key + if kh_exist_float64(self.table, k): + self.table.vals[k] = val + else: + raise KeyError(key) + + @cython.boundscheck(False) + def map(self, float64_t[:] keys, int64_t[:] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + float64_t key + khiter_t k + + with nogil: + for i in range(n): + key = keys[i] + k = kh_put_float64(self.table, key, &ret) + self.table.vals[k] = values[i] + + @cython.boundscheck(False) + def map_locations(self, ndarray[float64_t, ndim=1] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + float64_t val + khiter_t k + + with nogil: + for i in range(n): + val = values[i] + k = kh_put_float64(self.table, val, &ret) + self.table.vals[k] = i + + @cython.boundscheck(False) + def lookup(self, float64_t[:] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + float64_t val + khiter_t k + int64_t[:] locs = np.empty(n, dtype=np.int64) + + with nogil: + for i in range(n): + val = values[i] + k = kh_get_float64(self.table, val) + if k != 
self.table.n_buckets: + locs[i] = self.table.vals[k] + else: + locs[i] = -1 + + return np.asarray(locs) + + def factorize(self, float64_t values): + uniques = Float64Vector() + labels = self.get_labels(values, uniques, 0, 0) + return uniques.to_array(), labels + + @cython.boundscheck(False) + def get_labels(self, float64_t[:] values, Float64Vector uniques, + Py_ssize_t count_prior, Py_ssize_t na_sentinel, + bint check_null=True): + cdef: + Py_ssize_t i, n = len(values) + int64_t[:] labels + Py_ssize_t idx, count = count_prior + int ret = 0 + float64_t val + khiter_t k + Float64VectorData *ud + + labels = np.empty(n, dtype=np.int64) + ud = uniques.data + + with nogil: + for i in range(n): + val = values[i] + + if check_null and val != val: + labels[i] = na_sentinel + continue + + k = kh_get_float64(self.table, val) + + if k != self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + else: + k = kh_put_float64(self.table, val, &ret) + self.table.vals[k] = count + + if needs_resize(ud): + with gil: + uniques.resize() + append_data_float64(ud, val) + labels[i] = count + count += 1 + + return np.asarray(labels) + + @cython.boundscheck(False) + def get_labels_groupby(self, float64_t[:] values): + cdef: + Py_ssize_t i, n = len(values) + int64_t[:] labels + Py_ssize_t idx, count = 0 + int ret = 0 + float64_t val + khiter_t k + Float64Vector uniques = Float64Vector() + Float64VectorData *ud + + labels = np.empty(n, dtype=np.int64) + ud = uniques.data + + with nogil: + for i in range(n): + val = values[i] + + # specific for groupby + if val < 0: + labels[i] = -1 + continue + + k = kh_get_float64(self.table, val) + if k != self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + else: + k = kh_put_float64(self.table, val, &ret) + self.table.vals[k] = count + + if needs_resize(ud): + with gil: + uniques.resize() + append_data_float64(ud, val) + labels[i] = count + count += 1 + + arr_uniques = uniques.to_array() + + return np.asarray(labels), arr_uniques + + @cython.boundscheck(False) + def unique(self, float64_t[:] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + float64_t val + khiter_t k + bint seen_na = 0 + Float64Vector uniques = Float64Vector() + Float64VectorData *ud + + ud = uniques.data + + with nogil: + for i in range(n): + val = values[i] + + if val == val: + k = kh_get_float64(self.table, val) + if k == self.table.n_buckets: + kh_put_float64(self.table, val, &ret) + if needs_resize(ud): + with gil: + uniques.resize() + append_data_float64(ud, val) + elif not seen_na: + seen_na = 1 + if needs_resize(ud): + with gil: + uniques.resize() + append_data_float64(ud, NAN) + + return uniques.to_array() + +cdef class Int64HashTable(HashTable): + + def __cinit__(self, size_hint=1): + self.table = kh_init_int64() + if size_hint is not None: + kh_resize_int64(self.table, size_hint) + + def __len__(self): + return self.table.size + + def __dealloc__(self): + kh_destroy_int64(self.table) + + def __contains__(self, object key): + cdef khiter_t k + k = kh_get_int64(self.table, key) + return k != self.table.n_buckets + + cpdef get_item(self, int64_t val): + cdef khiter_t k + k = kh_get_int64(self.table, val) + if k != self.table.n_buckets: + return self.table.vals[k] + else: + raise KeyError(val) + + def get_iter_test(self, int64_t key, Py_ssize_t iterations): + cdef Py_ssize_t i, val=0 + for i in range(iterations): + k = kh_get_int64(self.table, val) + if k != self.table.n_buckets: + val = self.table.vals[k] + + cpdef set_item(self, int64_t key, Py_ssize_t val): + 
cdef: + khiter_t k + int ret = 0 + + k = kh_put_int64(self.table, key, &ret) + self.table.keys[k] = key + if kh_exist_int64(self.table, k): + self.table.vals[k] = val + else: + raise KeyError(key) + + @cython.boundscheck(False) + def map(self, int64_t[:] keys, int64_t[:] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + int64_t key + khiter_t k + + with nogil: + for i in range(n): + key = keys[i] + k = kh_put_int64(self.table, key, &ret) + self.table.vals[k] = values[i] + + @cython.boundscheck(False) + def map_locations(self, ndarray[int64_t, ndim=1] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + int64_t val + khiter_t k + + with nogil: + for i in range(n): + val = values[i] + k = kh_put_int64(self.table, val, &ret) + self.table.vals[k] = i + + @cython.boundscheck(False) + def lookup(self, int64_t[:] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + int64_t val + khiter_t k + int64_t[:] locs = np.empty(n, dtype=np.int64) + + with nogil: + for i in range(n): + val = values[i] + k = kh_get_int64(self.table, val) + if k != self.table.n_buckets: + locs[i] = self.table.vals[k] + else: + locs[i] = -1 + + return np.asarray(locs) + + def factorize(self, int64_t values): + uniques = Int64Vector() + labels = self.get_labels(values, uniques, 0, 0) + return uniques.to_array(), labels + + @cython.boundscheck(False) + def get_labels(self, int64_t[:] values, Int64Vector uniques, + Py_ssize_t count_prior, Py_ssize_t na_sentinel, + bint check_null=True): + cdef: + Py_ssize_t i, n = len(values) + int64_t[:] labels + Py_ssize_t idx, count = count_prior + int ret = 0 + int64_t val + khiter_t k + Int64VectorData *ud + + labels = np.empty(n, dtype=np.int64) + ud = uniques.data + + with nogil: + for i in range(n): + val = values[i] + + if check_null and val == iNaT: + labels[i] = na_sentinel + continue + + k = kh_get_int64(self.table, val) + + if k != self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + else: + k = kh_put_int64(self.table, val, &ret) + self.table.vals[k] = count + + if needs_resize(ud): + with gil: + uniques.resize() + append_data_int64(ud, val) + labels[i] = count + count += 1 + + return np.asarray(labels) + + @cython.boundscheck(False) + def get_labels_groupby(self, int64_t[:] values): + cdef: + Py_ssize_t i, n = len(values) + int64_t[:] labels + Py_ssize_t idx, count = 0 + int ret = 0 + int64_t val + khiter_t k + Int64Vector uniques = Int64Vector() + Int64VectorData *ud + + labels = np.empty(n, dtype=np.int64) + ud = uniques.data + + with nogil: + for i in range(n): + val = values[i] + + # specific for groupby + if val < 0: + labels[i] = -1 + continue + + k = kh_get_int64(self.table, val) + if k != self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + else: + k = kh_put_int64(self.table, val, &ret) + self.table.vals[k] = count + + if needs_resize(ud): + with gil: + uniques.resize() + append_data_int64(ud, val) + labels[i] = count + count += 1 + + arr_uniques = uniques.to_array() + + return np.asarray(labels), arr_uniques + + @cython.boundscheck(False) + def unique(self, int64_t[:] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + int64_t val + khiter_t k + bint seen_na = 0 + Int64Vector uniques = Int64Vector() + Int64VectorData *ud + + ud = uniques.data + + with nogil: + for i in range(n): + val = values[i] + + k = kh_get_int64(self.table, val) + if k == self.table.n_buckets: + kh_put_int64(self.table, val, &ret) + if needs_resize(ud): + with gil: + uniques.resize() + append_data_int64(ud, val) + + 
return uniques.to_array() + + +cdef class StringHashTable(HashTable): + cdef kh_str_t *table + + def __cinit__(self, int size_hint=1): + self.table = kh_init_str() + if size_hint is not None: + kh_resize_str(self.table, size_hint) + + def __dealloc__(self): + kh_destroy_str(self.table) + + cpdef get_item(self, object val): + cdef khiter_t k + k = kh_get_str(self.table, util.get_c_string(val)) + if k != self.table.n_buckets: + return self.table.vals[k] + else: + raise KeyError(val) + + def get_iter_test(self, object key, Py_ssize_t iterations): + cdef Py_ssize_t i, val + for i in range(iterations): + k = kh_get_str(self.table, util.get_c_string(key)) + if k != self.table.n_buckets: + val = self.table.vals[k] + + cpdef set_item(self, object key, Py_ssize_t val): + cdef: + khiter_t k + int ret = 0 + char* buf + + buf = util.get_c_string(key) + + k = kh_put_str(self.table, buf, &ret) + self.table.keys[k] = key + if kh_exist_str(self.table, k): + self.table.vals[k] = val + else: + raise KeyError(key) + + def get_indexer(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + ndarray[int64_t] labels = np.empty(n, dtype=np.int64) + char *buf + int64_t *resbuf = labels.data + khiter_t k + kh_str_t *table = self.table + + for i in range(n): + buf = util.get_c_string(values[i]) + k = kh_get_str(table, buf) + if k != table.n_buckets: + resbuf[i] = table.vals[k] + else: + resbuf[i] = -1 + return labels + + def unique(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + object val + char *buf + khiter_t k + ObjectVector uniques = ObjectVector() + + for i in range(n): + val = values[i] + buf = util.get_c_string(val) + k = kh_get_str(self.table, buf) + if k == self.table.n_buckets: + kh_put_str(self.table, buf, &ret) + uniques.append(val) + + return uniques.to_array() + + def factorize(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + ndarray[int64_t] labels = np.empty(n, dtype=np.int64) + dict reverse = {} + Py_ssize_t idx, count = 0 + int ret = 0 + object val + char *buf + khiter_t k + + for i in range(n): + val = values[i] + buf = util.get_c_string(val) + k = kh_get_str(self.table, buf) + if k != self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + else: + k = kh_put_str(self.table, buf, &ret) + # print 'putting %s, %s' % (val, count) + + self.table.vals[k] = count + reverse[count] = val + labels[i] = count + count += 1 + + return reverse, labels + + +na_sentinel = object + +cdef class PyObjectHashTable(HashTable): + + def __init__(self, size_hint=1): + self.table = kh_init_pymap() + kh_resize_pymap(self.table, size_hint) + + def __dealloc__(self): + if self.table is not NULL: + self.destroy() + + def __len__(self): + return self.table.size + + def __contains__(self, object key): + cdef khiter_t k + hash(key) + if key != key or key is None: + key = na_sentinel + k = kh_get_pymap(self.table, key) + return k != self.table.n_buckets + + def destroy(self): + kh_destroy_pymap(self.table) + self.table = NULL + + cpdef get_item(self, object val): + cdef khiter_t k + if val != val or val is None: + val = na_sentinel + k = kh_get_pymap(self.table, val) + if k != self.table.n_buckets: + return self.table.vals[k] + else: + raise KeyError(val) + + def get_iter_test(self, object key, Py_ssize_t iterations): + cdef Py_ssize_t i, val + if key != key or key is None: + key = na_sentinel + for i in range(iterations): + k = kh_get_pymap(self.table, key) + if k != self.table.n_buckets: + val = self.table.vals[k] + + cpdef 
set_item(self, object key, Py_ssize_t val): + cdef: + khiter_t k + int ret = 0 + char* buf + + hash(key) + if key != key or key is None: + key = na_sentinel + k = kh_put_pymap(self.table, key, &ret) + # self.table.keys[k] = key + if kh_exist_pymap(self.table, k): + self.table.vals[k] = val + else: + raise KeyError(key) + + def map_locations(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + object val + khiter_t k + + for i in range(n): + val = values[i] + hash(val) + if val != val or val is None: + val = na_sentinel + + k = kh_put_pymap(self.table, val, &ret) + self.table.vals[k] = i + + def lookup(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + object val + khiter_t k + int64_t[:] locs = np.empty(n, dtype=np.int64) + + for i in range(n): + val = values[i] + hash(val) + if val != val or val is None: + val = na_sentinel + + k = kh_get_pymap(self.table, val) + if k != self.table.n_buckets: + locs[i] = self.table.vals[k] + else: + locs[i] = -1 + + return np.asarray(locs) + + def unique(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + object val + khiter_t k + ObjectVector uniques = ObjectVector() + bint seen_na = 0 + + for i in range(n): + val = values[i] + hash(val) + if not _checknan(val): + k = kh_get_pymap(self.table, val) + if k == self.table.n_buckets: + kh_put_pymap(self.table, val, &ret) + uniques.append(val) + elif not seen_na: + seen_na = 1 + uniques.append(nan) + + return uniques.to_array() + + def get_labels(self, ndarray[object] values, ObjectVector uniques, + Py_ssize_t count_prior, int64_t na_sentinel, + bint check_null=True): + cdef: + Py_ssize_t i, n = len(values) + int64_t[:] labels + Py_ssize_t idx, count = count_prior + int ret = 0 + object val + khiter_t k + + labels = np.empty(n, dtype=np.int64) + + for i in range(n): + val = values[i] + hash(val) + + if check_null and val != val or val is None: + labels[i] = na_sentinel + continue + + k = kh_get_pymap(self.table, val) + if k != self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + else: + k = kh_put_pymap(self.table, val, &ret) + self.table.vals[k] = count + uniques.append(val) + labels[i] = count + count += 1 + + return np.asarray(labels) \ No newline at end of file diff --git a/pandas/src/hashtable_class_helper.pxi.in b/pandas/src/hashtable_class_helper.pxi.in new file mode 100644 index 0000000000000..14e5363eee20c --- /dev/null +++ b/pandas/src/hashtable_class_helper.pxi.in @@ -0,0 +1,642 @@ +""" +Template for each `dtype` helper function for hashtable + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +#---------------------------------------------------------------------- +# VectorData +#---------------------------------------------------------------------- + +{{py: + +# name, dtype +dtypes = [('Float64', 'float64'), ('Int64', 'int64')] + +}} + +{{for name, dtype in dtypes}} + + +ctypedef struct {{name}}VectorData: + {{dtype}}_t *data + size_t n, m + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef void append_data_{{dtype}}({{name}}VectorData *data, + {{dtype}}_t x) nogil: + + data.data[data.n] = x + data.n += 1 + +{{endfor}} + +ctypedef fused vector_data: + Int64VectorData + Float64VectorData + +cdef bint needs_resize(vector_data *data) nogil: + return data.n == data.m + +#---------------------------------------------------------------------- +# Vector +#---------------------------------------------------------------------- + +{{py: + +# 
name, dtype +dtypes = [('Float64', 'float64'), ('Int64', 'int64')] + +}} + +{{for name, dtype in dtypes}} + +cdef class {{name}}Vector: + + cdef: + {{name}}VectorData *data + ndarray ao + + def __cinit__(self): + self.data = <{{name}}VectorData *>PyMem_Malloc( + sizeof({{name}}VectorData)) + if not self.data: + raise MemoryError() + self.data.n = 0 + self.data.m = _INIT_VEC_CAP + self.ao = np.empty(self.data.m, dtype=np.{{dtype}}) + self.data.data = <{{dtype}}_t*> self.ao.data + + cdef resize(self): + self.data.m = max(self.data.m * 4, _INIT_VEC_CAP) + self.ao.resize(self.data.m) + self.data.data = <{{dtype}}_t*> self.ao.data + + def __dealloc__(self): + PyMem_Free(self.data) + + def __len__(self): + return self.data.n + + def to_array(self): + self.ao.resize(self.data.n) + self.data.m = self.data.n + return self.ao + + cdef inline void append(self, {{dtype}}_t x): + + if needs_resize(self.data): + self.resize() + + append_data_{{dtype}}(self.data, x) + +{{endfor}} + + +cdef class ObjectVector: + + cdef: + PyObject **data + size_t n, m + ndarray ao + + def __cinit__(self): + self.n = 0 + self.m = _INIT_VEC_CAP + self.ao = np.empty(_INIT_VEC_CAP, dtype=object) + self.data = self.ao.data + + def __len__(self): + return self.n + + cdef inline append(self, object o): + if self.n == self.m: + self.m = max(self.m * 2, _INIT_VEC_CAP) + self.ao.resize(self.m) + self.data = self.ao.data + + Py_INCREF(o) + self.data[self.n] = o + self.n += 1 + + def to_array(self): + self.ao.resize(self.n) + self.m = self.n + return self.ao + + +#---------------------------------------------------------------------- +# HashTable +#---------------------------------------------------------------------- + + +cdef class HashTable: + pass + +{{py: + +# name, dtype, null_condition, float_group +dtypes = [('Float64', 'float64', 'val != val', True), + ('Int64', 'int64', 'val == iNaT', False)] + +}} + + +{{for name, dtype, null_condition, float_group in dtypes}} + +cdef class {{name}}HashTable(HashTable): + + def __cinit__(self, size_hint=1): + self.table = kh_init_{{dtype}}() + if size_hint is not None: + kh_resize_{{dtype}}(self.table, size_hint) + + def __len__(self): + return self.table.size + + def __dealloc__(self): + kh_destroy_{{dtype}}(self.table) + + def __contains__(self, object key): + cdef khiter_t k + k = kh_get_{{dtype}}(self.table, key) + return k != self.table.n_buckets + + cpdef get_item(self, {{dtype}}_t val): + cdef khiter_t k + k = kh_get_{{dtype}}(self.table, val) + if k != self.table.n_buckets: + return self.table.vals[k] + else: + raise KeyError(val) + + def get_iter_test(self, {{dtype}}_t key, Py_ssize_t iterations): + cdef Py_ssize_t i, val=0 + for i in range(iterations): + k = kh_get_{{dtype}}(self.table, val) + if k != self.table.n_buckets: + val = self.table.vals[k] + + cpdef set_item(self, {{dtype}}_t key, Py_ssize_t val): + cdef: + khiter_t k + int ret = 0 + + k = kh_put_{{dtype}}(self.table, key, &ret) + self.table.keys[k] = key + if kh_exist_{{dtype}}(self.table, k): + self.table.vals[k] = val + else: + raise KeyError(key) + + @cython.boundscheck(False) + def map(self, {{dtype}}_t[:] keys, int64_t[:] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + {{dtype}}_t key + khiter_t k + + with nogil: + for i in range(n): + key = keys[i] + k = kh_put_{{dtype}}(self.table, key, &ret) + self.table.vals[k] = values[i] + + @cython.boundscheck(False) + def map_locations(self, ndarray[{{dtype}}_t, ndim=1] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + {{dtype}}_t val + 
khiter_t k + + with nogil: + for i in range(n): + val = values[i] + k = kh_put_{{dtype}}(self.table, val, &ret) + self.table.vals[k] = i + + @cython.boundscheck(False) + def lookup(self, {{dtype}}_t[:] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + {{dtype}}_t val + khiter_t k + int64_t[:] locs = np.empty(n, dtype=np.int64) + + with nogil: + for i in range(n): + val = values[i] + k = kh_get_{{dtype}}(self.table, val) + if k != self.table.n_buckets: + locs[i] = self.table.vals[k] + else: + locs[i] = -1 + + return np.asarray(locs) + + def factorize(self, {{dtype}}_t values): + uniques = {{name}}Vector() + labels = self.get_labels(values, uniques, 0, 0) + return uniques.to_array(), labels + + @cython.boundscheck(False) + def get_labels(self, {{dtype}}_t[:] values, {{name}}Vector uniques, + Py_ssize_t count_prior, Py_ssize_t na_sentinel, + bint check_null=True): + cdef: + Py_ssize_t i, n = len(values) + int64_t[:] labels + Py_ssize_t idx, count = count_prior + int ret = 0 + {{dtype}}_t val + khiter_t k + {{name}}VectorData *ud + + labels = np.empty(n, dtype=np.int64) + ud = uniques.data + + with nogil: + for i in range(n): + val = values[i] + + if check_null and {{null_condition}}: + labels[i] = na_sentinel + continue + + k = kh_get_{{dtype}}(self.table, val) + + if k != self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + else: + k = kh_put_{{dtype}}(self.table, val, &ret) + self.table.vals[k] = count + + if needs_resize(ud): + with gil: + uniques.resize() + append_data_{{dtype}}(ud, val) + labels[i] = count + count += 1 + + return np.asarray(labels) + + @cython.boundscheck(False) + def get_labels_groupby(self, {{dtype}}_t[:] values): + cdef: + Py_ssize_t i, n = len(values) + int64_t[:] labels + Py_ssize_t idx, count = 0 + int ret = 0 + {{dtype}}_t val + khiter_t k + {{name}}Vector uniques = {{name}}Vector() + {{name}}VectorData *ud + + labels = np.empty(n, dtype=np.int64) + ud = uniques.data + + with nogil: + for i in range(n): + val = values[i] + + # specific for groupby + if val < 0: + labels[i] = -1 + continue + + k = kh_get_{{dtype}}(self.table, val) + if k != self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + else: + k = kh_put_{{dtype}}(self.table, val, &ret) + self.table.vals[k] = count + + if needs_resize(ud): + with gil: + uniques.resize() + append_data_{{dtype}}(ud, val) + labels[i] = count + count += 1 + + arr_uniques = uniques.to_array() + + return np.asarray(labels), arr_uniques + + @cython.boundscheck(False) + def unique(self, {{dtype}}_t[:] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + {{dtype}}_t val + khiter_t k + bint seen_na = 0 + {{name}}Vector uniques = {{name}}Vector() + {{name}}VectorData *ud + + ud = uniques.data + + with nogil: + for i in range(n): + val = values[i] + + {{if float_group}} + if val == val: + k = kh_get_{{dtype}}(self.table, val) + if k == self.table.n_buckets: + kh_put_{{dtype}}(self.table, val, &ret) + if needs_resize(ud): + with gil: + uniques.resize() + append_data_{{dtype}}(ud, val) + elif not seen_na: + seen_na = 1 + if needs_resize(ud): + with gil: + uniques.resize() + append_data_{{dtype}}(ud, NAN) + {{else}} + k = kh_get_{{dtype}}(self.table, val) + if k == self.table.n_buckets: + kh_put_{{dtype}}(self.table, val, &ret) + if needs_resize(ud): + with gil: + uniques.resize() + append_data_{{dtype}}(ud, val) + {{endif}} + + return uniques.to_array() + +{{endfor}} + + +cdef class StringHashTable(HashTable): + cdef kh_str_t *table + + def __cinit__(self, int size_hint=1): + 
self.table = kh_init_str() + if size_hint is not None: + kh_resize_str(self.table, size_hint) + + def __dealloc__(self): + kh_destroy_str(self.table) + + cpdef get_item(self, object val): + cdef khiter_t k + k = kh_get_str(self.table, util.get_c_string(val)) + if k != self.table.n_buckets: + return self.table.vals[k] + else: + raise KeyError(val) + + def get_iter_test(self, object key, Py_ssize_t iterations): + cdef Py_ssize_t i, val + for i in range(iterations): + k = kh_get_str(self.table, util.get_c_string(key)) + if k != self.table.n_buckets: + val = self.table.vals[k] + + cpdef set_item(self, object key, Py_ssize_t val): + cdef: + khiter_t k + int ret = 0 + char* buf + + buf = util.get_c_string(key) + + k = kh_put_str(self.table, buf, &ret) + self.table.keys[k] = key + if kh_exist_str(self.table, k): + self.table.vals[k] = val + else: + raise KeyError(key) + + def get_indexer(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + ndarray[int64_t] labels = np.empty(n, dtype=np.int64) + char *buf + int64_t *resbuf = labels.data + khiter_t k + kh_str_t *table = self.table + + for i in range(n): + buf = util.get_c_string(values[i]) + k = kh_get_str(table, buf) + if k != table.n_buckets: + resbuf[i] = table.vals[k] + else: + resbuf[i] = -1 + return labels + + def unique(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + object val + char *buf + khiter_t k + ObjectVector uniques = ObjectVector() + + for i in range(n): + val = values[i] + buf = util.get_c_string(val) + k = kh_get_str(self.table, buf) + if k == self.table.n_buckets: + kh_put_str(self.table, buf, &ret) + uniques.append(val) + + return uniques.to_array() + + def factorize(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + ndarray[int64_t] labels = np.empty(n, dtype=np.int64) + dict reverse = {} + Py_ssize_t idx, count = 0 + int ret = 0 + object val + char *buf + khiter_t k + + for i in range(n): + val = values[i] + buf = util.get_c_string(val) + k = kh_get_str(self.table, buf) + if k != self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + else: + k = kh_put_str(self.table, buf, &ret) + # print 'putting %s, %s' % (val, count) + + self.table.vals[k] = count + reverse[count] = val + labels[i] = count + count += 1 + + return reverse, labels + + +na_sentinel = object + +cdef class PyObjectHashTable(HashTable): + + def __init__(self, size_hint=1): + self.table = kh_init_pymap() + kh_resize_pymap(self.table, size_hint) + + def __dealloc__(self): + if self.table is not NULL: + self.destroy() + + def __len__(self): + return self.table.size + + def __contains__(self, object key): + cdef khiter_t k + hash(key) + if key != key or key is None: + key = na_sentinel + k = kh_get_pymap(self.table, key) + return k != self.table.n_buckets + + def destroy(self): + kh_destroy_pymap(self.table) + self.table = NULL + + cpdef get_item(self, object val): + cdef khiter_t k + if val != val or val is None: + val = na_sentinel + k = kh_get_pymap(self.table, val) + if k != self.table.n_buckets: + return self.table.vals[k] + else: + raise KeyError(val) + + def get_iter_test(self, object key, Py_ssize_t iterations): + cdef Py_ssize_t i, val + if key != key or key is None: + key = na_sentinel + for i in range(iterations): + k = kh_get_pymap(self.table, key) + if k != self.table.n_buckets: + val = self.table.vals[k] + + cpdef set_item(self, object key, Py_ssize_t val): + cdef: + khiter_t k + int ret = 0 + char* buf + + hash(key) + if key != key or key is None: + key = 
na_sentinel + k = kh_put_pymap(self.table, key, &ret) + # self.table.keys[k] = key + if kh_exist_pymap(self.table, k): + self.table.vals[k] = val + else: + raise KeyError(key) + + def map_locations(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + object val + khiter_t k + + for i in range(n): + val = values[i] + hash(val) + if val != val or val is None: + val = na_sentinel + + k = kh_put_pymap(self.table, val, &ret) + self.table.vals[k] = i + + def lookup(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + object val + khiter_t k + int64_t[:] locs = np.empty(n, dtype=np.int64) + + for i in range(n): + val = values[i] + hash(val) + if val != val or val is None: + val = na_sentinel + + k = kh_get_pymap(self.table, val) + if k != self.table.n_buckets: + locs[i] = self.table.vals[k] + else: + locs[i] = -1 + + return np.asarray(locs) + + def unique(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + object val + khiter_t k + ObjectVector uniques = ObjectVector() + bint seen_na = 0 + + for i in range(n): + val = values[i] + hash(val) + if not _checknan(val): + k = kh_get_pymap(self.table, val) + if k == self.table.n_buckets: + kh_put_pymap(self.table, val, &ret) + uniques.append(val) + elif not seen_na: + seen_na = 1 + uniques.append(nan) + + return uniques.to_array() + + def get_labels(self, ndarray[object] values, ObjectVector uniques, + Py_ssize_t count_prior, int64_t na_sentinel, + bint check_null=True): + cdef: + Py_ssize_t i, n = len(values) + int64_t[:] labels + Py_ssize_t idx, count = count_prior + int ret = 0 + object val + khiter_t k + + labels = np.empty(n, dtype=np.int64) + + for i in range(n): + val = values[i] + hash(val) + + if check_null and val != val or val is None: + labels[i] = na_sentinel + continue + + k = kh_get_pymap(self.table, val) + if k != self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + else: + k = kh_put_pymap(self.table, val, &ret) + self.table.vals[k] = count + uniques.append(val) + labels[i] = count + count += 1 + + return np.asarray(labels) \ No newline at end of file diff --git a/pandas/src/hashtable_func_helper.pxi b/pandas/src/hashtable_func_helper.pxi new file mode 100644 index 0000000000000..d05b81acc5dd5 --- /dev/null +++ b/pandas/src/hashtable_func_helper.pxi @@ -0,0 +1,197 @@ +""" +Template for each `dtype` helper function for hashtable + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +#---------------------------------------------------------------------- +# VectorData +#---------------------------------------------------------------------- + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef build_count_table_float64(float64_t[:] values, + kh_float64_t *table, bint dropna): + cdef: + khiter_t k + Py_ssize_t i, n = len(values) + float64_t val + int ret = 0 + + with nogil: + kh_resize_float64(table, n) + + for i in range(n): + val = values[i] + if val == val or not dropna: + k = kh_get_float64(table, val) + if k != table.n_buckets: + table.vals[k] += 1 + else: + k = kh_put_float64(table, val, &ret) + table.vals[k] = 1 + + +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef value_count_float64(float64_t[:] values, bint dropna): + cdef: + Py_ssize_t i=0 + kh_float64_t *table + float64_t[:] result_keys + int64_t[:] result_counts + int k + + table = kh_init_float64() + build_count_table_float64(values, table, dropna) + + result_keys = np.empty(table.n_occupied, dtype=np.float64) 
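For orientation, here is a minimal pure-Python sketch of the labeling scheme that the get_labels methods above implement, with a plain dict standing in for the khash table and a list standing in for the uniques vector. get_labels_sketch and its null handling are illustrative only, not pandas API.

import numpy as np

def get_labels_sketch(values, na_sentinel=-1):
    # Pure-Python model of the khash-backed get_labels above: each value is
    # mapped to the position of its first occurrence in `uniques`; nulls
    # receive `na_sentinel`.
    table = {}          # stands in for the khash table
    uniques = []        # stands in for the uniques vector
    labels = np.empty(len(values), dtype=np.int64)
    for i, val in enumerate(values):
        if val is None or val != val:      # null check (NaN != NaN)
            labels[i] = na_sentinel
            continue
        if val in table:
            labels[i] = table[val]
        else:
            table[val] = len(uniques)
            labels[i] = len(uniques)
            uniques.append(val)
    return np.asarray(labels), uniques

# e.g. get_labels_sketch(['a', 'b', 'a', None]) -> (array([0, 1, 0, -1]), ['a', 'b'])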
+ result_counts = np.zeros(table.n_occupied, dtype=np.int64) + + with nogil: + for k in range(table.n_buckets): + if kh_exist_float64(table, k): + result_keys[i] = table.keys[k] + result_counts[i] = table.vals[k] + i += 1 + kh_destroy_float64(table) + + return np.asarray(result_keys), np.asarray(result_counts) + + +@cython.wraparound(False) +@cython.boundscheck(False) +def duplicated_float64(float64_t[:] values, + object keep='first'): + cdef: + int ret = 0, k + float64_t value + Py_ssize_t i, n = len(values) + kh_float64_t * table = kh_init_float64() + ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool') + + kh_resize_float64(table, min(n, _SIZE_HINT_LIMIT)) + + if keep not in ('last', 'first', False): + raise ValueError('keep must be either "first", "last" or False') + + if keep == 'last': + with nogil: + for i from n > i >=0: + kh_put_float64(table, values[i], &ret) + out[i] = ret == 0 + elif keep == 'first': + with nogil: + for i from 0 <= i < n: + kh_put_float64(table, values[i], &ret) + out[i] = ret == 0 + else: + with nogil: + for i from 0 <= i < n: + value = values[i] + k = kh_get_float64(table, value) + if k != table.n_buckets: + out[table.vals[k]] = 1 + out[i] = 1 + else: + k = kh_put_float64(table, value, &ret) + table.keys[k] = value + table.vals[k] = i + out[i] = 0 + kh_destroy_float64(table) + return out + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef build_count_table_int64(int64_t[:] values, + kh_int64_t *table, bint dropna): + cdef: + khiter_t k + Py_ssize_t i, n = len(values) + int64_t val + int ret = 0 + + with nogil: + kh_resize_int64(table, n) + + for i in range(n): + val = values[i] + if val == val or not dropna: + k = kh_get_int64(table, val) + if k != table.n_buckets: + table.vals[k] += 1 + else: + k = kh_put_int64(table, val, &ret) + table.vals[k] = 1 + + +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef value_count_int64(int64_t[:] values, bint dropna): + cdef: + Py_ssize_t i=0 + kh_int64_t *table + int64_t[:] result_keys + int64_t[:] result_counts + int k + + table = kh_init_int64() + build_count_table_int64(values, table, dropna) + + result_keys = np.empty(table.n_occupied, dtype=np.int64) + result_counts = np.zeros(table.n_occupied, dtype=np.int64) + + with nogil: + for k in range(table.n_buckets): + if kh_exist_int64(table, k): + result_keys[i] = table.keys[k] + result_counts[i] = table.vals[k] + i += 1 + kh_destroy_int64(table) + + return np.asarray(result_keys), np.asarray(result_counts) + + +@cython.wraparound(False) +@cython.boundscheck(False) +def duplicated_int64(int64_t[:] values, + object keep='first'): + cdef: + int ret = 0, k + int64_t value + Py_ssize_t i, n = len(values) + kh_int64_t * table = kh_init_int64() + ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool') + + kh_resize_int64(table, min(n, _SIZE_HINT_LIMIT)) + + if keep not in ('last', 'first', False): + raise ValueError('keep must be either "first", "last" or False') + + if keep == 'last': + with nogil: + for i from n > i >=0: + kh_put_int64(table, values[i], &ret) + out[i] = ret == 0 + elif keep == 'first': + with nogil: + for i from 0 <= i < n: + kh_put_int64(table, values[i], &ret) + out[i] = ret == 0 + else: + with nogil: + for i from 0 <= i < n: + value = values[i] + k = kh_get_int64(table, value) + if k != table.n_buckets: + out[table.vals[k]] = 1 + out[i] = 1 + else: + k = kh_put_int64(table, value, &ret) + table.keys[k] = value + table.vals[k] = i + out[i] = 0 + kh_destroy_int64(table) + return out diff --git 
a/pandas/src/hashtable_func_helper.pxi.in b/pandas/src/hashtable_func_helper.pxi.in new file mode 100644 index 0000000000000..1840b914f3328 --- /dev/null +++ b/pandas/src/hashtable_func_helper.pxi.in @@ -0,0 +1,114 @@ +""" +Template for each `dtype` helper function for hashtable + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +#---------------------------------------------------------------------- +# VectorData +#---------------------------------------------------------------------- + +{{py: + +# name +dtypes = ['float64', 'int64'] + +}} + +{{for dtype in dtypes}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef build_count_table_{{dtype}}({{dtype}}_t[:] values, + kh_{{dtype}}_t *table, bint dropna): + cdef: + khiter_t k + Py_ssize_t i, n = len(values) + {{dtype}}_t val + int ret = 0 + + with nogil: + kh_resize_{{dtype}}(table, n) + + for i in range(n): + val = values[i] + if val == val or not dropna: + k = kh_get_{{dtype}}(table, val) + if k != table.n_buckets: + table.vals[k] += 1 + else: + k = kh_put_{{dtype}}(table, val, &ret) + table.vals[k] = 1 + + +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef value_count_{{dtype}}({{dtype}}_t[:] values, bint dropna): + cdef: + Py_ssize_t i=0 + kh_{{dtype}}_t *table + {{dtype}}_t[:] result_keys + int64_t[:] result_counts + int k + + table = kh_init_{{dtype}}() + build_count_table_{{dtype}}(values, table, dropna) + + result_keys = np.empty(table.n_occupied, dtype=np.{{dtype}}) + result_counts = np.zeros(table.n_occupied, dtype=np.int64) + + with nogil: + for k in range(table.n_buckets): + if kh_exist_{{dtype}}(table, k): + result_keys[i] = table.keys[k] + result_counts[i] = table.vals[k] + i += 1 + kh_destroy_{{dtype}}(table) + + return np.asarray(result_keys), np.asarray(result_counts) + + +@cython.wraparound(False) +@cython.boundscheck(False) +def duplicated_{{dtype}}({{dtype}}_t[:] values, + object keep='first'): + cdef: + int ret = 0, k + {{dtype}}_t value + Py_ssize_t i, n = len(values) + kh_{{dtype}}_t * table = kh_init_{{dtype}}() + ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool') + + kh_resize_{{dtype}}(table, min(n, _SIZE_HINT_LIMIT)) + + if keep not in ('last', 'first', False): + raise ValueError('keep must be either "first", "last" or False') + + if keep == 'last': + with nogil: + for i from n > i >=0: + kh_put_{{dtype}}(table, values[i], &ret) + out[i] = ret == 0 + elif keep == 'first': + with nogil: + for i from 0 <= i < n: + kh_put_{{dtype}}(table, values[i], &ret) + out[i] = ret == 0 + else: + with nogil: + for i from 0 <= i < n: + value = values[i] + k = kh_get_{{dtype}}(table, value) + if k != table.n_buckets: + out[table.vals[k]] = 1 + out[i] = 1 + else: + k = kh_put_{{dtype}}(table, value, &ret) + table.keys[k] = value + table.vals[k] = i + out[i] = 0 + kh_destroy_{{dtype}}(table) + return out + +{{endfor}} diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx index 5f7c5478b5d87..4fa730eac0fd1 100644 --- a/pandas/src/inference.pyx +++ b/pandas/src/inference.pyx @@ -6,51 +6,70 @@ iNaT = util.get_nat() cdef bint PY2 = sys.version_info[0] == 2 +cdef extern from "headers/stdint.h": + enum: UINT8_MAX + enum: UINT16_MAX + enum: UINT32_MAX + enum: UINT64_MAX + enum: INT8_MIN + enum: INT8_MAX + enum: INT16_MIN + enum: INT16_MAX + enum: INT32_MAX + enum: INT32_MIN + enum: INT64_MAX + enum: INT64_MIN + # core.common import for fast inference checks + + def is_float(object obj): return util.is_float_object(obj) + def is_integer(object obj): return 
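The duplicated_{{dtype}} template above (in hashtable_func_helper.pxi.in) scans the values through a khash table; the sketch below models the same keep='first' / keep='last' / keep=False semantics in plain Python. duplicated_sketch is an illustrative name, and the set/dict stand in for the hash table.

import numpy as np

def duplicated_sketch(values, keep='first'):
    # keep='first': mark every occurrence after the first (forward scan);
    # keep='last': mark every occurrence before the last (backward scan);
    # keep=False: mark every value that occurs more than once.
    n = len(values)
    out = np.zeros(n, dtype=bool)
    if keep == 'first':
        seen = set()
        for i in range(n):
            out[i] = values[i] in seen
            seen.add(values[i])
    elif keep == 'last':
        seen = set()
        for i in range(n - 1, -1, -1):
            out[i] = values[i] in seen
            seen.add(values[i])
    elif keep is False:
        first_pos = {}
        for i in range(n):
            if values[i] in first_pos:
                out[first_pos[values[i]]] = True
                out[i] = True
            else:
                first_pos[values[i]] = i
    else:
        raise ValueError('keep must be either "first", "last" or False')
    return out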
util.is_integer_object(obj) + def is_bool(object obj): return util.is_bool_object(obj) + def is_complex(object obj): return util.is_complex_object(obj) -def is_period(object val): +cpdef bint is_period(object val): """ Return a boolean if this is a Period object """ return util.is_period_object(val) _TYPE_MAP = { - 'categorical' : 'categorical', - 'category' : 'categorical', + 'categorical': 'categorical', + 'category': 'categorical', 'int8': 'integer', 'int16': 'integer', 'int32': 'integer', 'int64': 'integer', - 'i' : 'integer', + 'i': 'integer', 'uint8': 'integer', 'uint16': 'integer', 'uint32': 'integer', 'uint64': 'integer', - 'u' : 'integer', + 'u': 'integer', 'float32': 'floating', 'float64': 'floating', - 'f' : 'floating', + 'f': 'floating', 'complex128': 'complex', - 'c' : 'complex', + 'c': 'complex', 'string': 'string' if PY2 else 'bytes', - 'S' : 'string' if PY2 else 'bytes', + 'S': 'string' if PY2 else 'bytes', 'unicode': 'unicode' if PY2 else 'string', - 'U' : 'unicode' if PY2 else 'string', + 'U': 'unicode' if PY2 else 'string', 'bool': 'boolean', - 'b' : 'boolean', - 'datetime64[ns]' : 'datetime64', - 'M' : 'datetime64', - 'timedelta64[ns]' : 'timedelta64', - 'm' : 'timedelta64', + 'b': 'boolean', + 'datetime64[ns]': 'datetime64', + 'M': 'datetime64', + 'timedelta64[ns]': 'timedelta64', + 'm': 'timedelta64', } # types only exist on certain platform @@ -74,12 +93,13 @@ cdef _try_infer_map(v): """ if its in our map, just return the dtype """ cdef: object attr, val - for attr in ['name','kind','base']: - val = getattr(v.dtype,attr) + for attr in ['name', 'kind', 'base']: + val = getattr(v.dtype, attr) if val in _TYPE_MAP: return _TYPE_MAP[val] return None + def infer_dtype(object _values): """ we are coercing to an ndarray here @@ -89,15 +109,17 @@ def infer_dtype(object _values): Py_ssize_t i, n object val ndarray values + bint seen_pdnat = False, seen_val = False if isinstance(_values, np.ndarray): values = _values - elif hasattr(_values,'dtype'): + elif hasattr(_values, 'dtype'): # this will handle ndarray-like # e.g. 
categoricals try: - values = getattr(_values, '_values', getattr(_values, 'values', _values)) + values = getattr(_values, '_values', getattr( + _values, 'values', _values)) except: val = _try_infer_map(_values) if val is not None: @@ -127,17 +149,34 @@ def infer_dtype(object _values): values = values.ravel() # try to use a valid value - for i in range(n): - val = util.get_value_1d(values, i) - if not is_null_datetimelike(val): - break + for i from 0 <= i < n: + val = util.get_value_1d(values, i) + + # do not use is_nul_datetimelike to keep + # np.datetime64('nat') and np.timedelta64('nat') + if util._checknull(val): + pass + elif val is NaT: + seen_pdnat = True + else: + seen_val = True + break - if util.is_datetime64_object(val) or val is NaT: + # if all values are nan/NaT + if seen_val is False and seen_pdnat is True: + return 'datetime' + # float/object nan is handled in latter logic + + if util.is_datetime64_object(val): if is_datetime64_array(values): return 'datetime64' elif is_timedelta_or_timedelta64_array(values): return 'timedelta' + elif is_timedelta(val): + if is_timedelta_or_timedelta64_array(values): + return 'timedelta' + elif util.is_integer_object(val): # a timedelta will show true here as well if is_timedelta(val): @@ -186,17 +225,15 @@ def infer_dtype(object _values): if is_bytes_array(values): return 'bytes' - elif is_timedelta(val): - if is_timedelta_or_timedelta64_array(values): - return 'timedelta' - elif is_period(val): if is_period_array(values): return 'period' for i in range(n): val = util.get_value_1d(values, i) - if util.is_integer_object(val): + if (util.is_integer_object(val) and + not util.is_timedelta64_object(val) and + not util.is_datetime64_object(val)): return 'mixed-integer' return 'mixed' @@ -212,31 +249,68 @@ def is_possible_datetimelike_array(object arr): for i in range(n): v = arr[i] if util.is_string_object(v): - continue + continue elif util._checknull(v): - continue + continue elif is_datetime(v): - seen_datetime=1 + seen_datetime=1 elif is_timedelta(v): - seen_timedelta=1 + seen_timedelta=1 else: - return False + return False return seen_datetime or seen_timedelta + cdef inline bint is_null_datetimelike(v): - # determine if we have a null for a timedelta/datetime (or integer versions)x + # determine if we have a null for a timedelta/datetime (or integer + # versions)x if util._checknull(v): return True + elif v is NaT: + return True elif util.is_timedelta64_object(v): return v.view('int64') == iNaT elif util.is_datetime64_object(v): return v.view('int64') == iNaT elif util.is_integer_object(v): return v == iNaT + return False + + +cdef inline bint is_null_datetime64(v): + # determine if we have a null for a datetime (or integer versions), + # excluding np.timedelta64('nat') + if util._checknull(v): + return True + elif v is NaT: + return True + elif util.is_datetime64_object(v): + return v.view('int64') == iNaT + return False + + +cdef inline bint is_null_timedelta64(v): + # determine if we have a null for a timedelta (or integer versions), + # excluding np.datetime64('nat') + if util._checknull(v): + return True elif v is NaT: return True + elif util.is_timedelta64_object(v): + return v.view('int64') == iNaT return False + +cdef inline bint is_null_period(v): + # determine if we have a null for a Period (or integer versions), + # excluding np.datetime64('nat') and np.timedelta64('nat') + if util._checknull(v): + return True + elif v is NaT: + return True + return False + + cdef inline bint is_datetime(object o): return PyDateTime_Check(o) @@ 
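A short standalone check of why the null predicates above are split by type: np.datetime64('nat') and np.timedelta64('nat') carry the same int64 sentinel payload, so they can only be told apart by their type, which is what is_null_datetime64 / is_null_timedelta64 key on. (Assumes only NumPy; the printed values are the NaT sentinel.)

import numpy as np

dt_nat = np.datetime64('nat')
td_nat = np.timedelta64('nat')
print(dt_nat.view('int64'), td_nat.view('int64'))   # same iNaT payload for both
print(type(dt_nat) is type(td_nat))                  # False: distinguishable only by type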
-249,6 +323,7 @@ cdef inline bint is_time(object o): cdef inline bint is_timedelta(object o): return PyDelta_Check(o) or util.is_timedelta64_object(o) + def is_bool_array(ndarray values): cdef: Py_ssize_t i, n = len(values) @@ -269,9 +344,11 @@ def is_bool_array(ndarray values): else: return False + def is_integer(object o): return util.is_integer_object(o) + def is_integer_array(ndarray values): cdef: Py_ssize_t i, n = len(values) @@ -292,6 +369,7 @@ def is_integer_array(ndarray values): else: return False + def is_integer_float_array(ndarray values): cdef: Py_ssize_t i, n = len(values) @@ -314,6 +392,7 @@ def is_integer_float_array(ndarray values): else: return False + def is_float_array(ndarray values): cdef: Py_ssize_t i, n = len(values) @@ -334,6 +413,7 @@ def is_float_array(ndarray values): else: return False + def is_string_array(ndarray values): cdef: Py_ssize_t i, n = len(values) @@ -355,6 +435,7 @@ def is_string_array(ndarray values): else: return False + def is_unicode_array(ndarray values): cdef: Py_ssize_t i, n = len(values) @@ -406,14 +487,15 @@ def is_datetime_array(ndarray[object] values): # return False for all nulls for i in range(n): v = values[i] - if is_null_datetimelike(v): + if is_null_datetime64(v): # we are a regular null if util._checknull(v): - null_count += 1 + null_count += 1 elif not is_datetime(v): return False return null_count != n + def is_datetime64_array(ndarray values): cdef Py_ssize_t i, null_count = 0, n = len(values) cdef object v @@ -423,7 +505,7 @@ def is_datetime64_array(ndarray values): # return False for all nulls for i in range(n): v = values[i] - if is_null_datetimelike(v): + if is_null_datetime64(v): # we are a regular null if util._checknull(v): null_count += 1 @@ -467,7 +549,7 @@ def is_timedelta_array(ndarray values): return False for i in range(n): v = values[i] - if is_null_datetimelike(v): + if is_null_timedelta64(v): # we are a regular null if util._checknull(v): null_count += 1 @@ -475,6 +557,7 @@ def is_timedelta_array(ndarray values): return False return null_count != n + def is_timedelta64_array(ndarray values): cdef Py_ssize_t i, null_count = 0, n = len(values) cdef object v @@ -482,7 +565,7 @@ def is_timedelta64_array(ndarray values): return False for i in range(n): v = values[i] - if is_null_datetimelike(v): + if is_null_timedelta64(v): # we are a regular null if util._checknull(v): null_count += 1 @@ -490,6 +573,7 @@ def is_timedelta64_array(ndarray values): return False return null_count != n + def is_timedelta_or_timedelta64_array(ndarray values): """ infer with timedeltas and/or nat/none """ cdef Py_ssize_t i, null_count = 0, n = len(values) @@ -498,7 +582,7 @@ def is_timedelta_or_timedelta64_array(ndarray values): return False for i in range(n): v = values[i] - if is_null_datetimelike(v): + if is_null_timedelta64(v): # we are a regular null if util._checknull(v): null_count += 1 @@ -506,6 +590,7 @@ def is_timedelta_or_timedelta64_array(ndarray values): return False return null_count != n + def is_date_array(ndarray[object] values): cdef Py_ssize_t i, n = len(values) if n == 0: @@ -515,6 +600,7 @@ def is_date_array(ndarray[object] values): return False return True + def is_time_array(ndarray[object] values): cdef Py_ssize_t i, n = len(values) if n == 0: @@ -524,20 +610,23 @@ def is_time_array(ndarray[object] values): return False return True -def is_period(object o): - from pandas import Period - return isinstance(o,Period) def is_period_array(ndarray[object] values): - cdef Py_ssize_t i, n = len(values) - from 
pandas.tseries.period import Period - + cdef Py_ssize_t i, null_count = 0, n = len(values) + cdef object v if n == 0: return False + + # return False for all nulls for i in range(n): - if not isinstance(values[i], Period): + v = values[i] + if is_null_period(v): + # we are a regular null + if util._checknull(v): + null_count += 1 + elif not is_period(v): return False - return True + return null_count != n cdef extern from "parse_helper.h": @@ -546,6 +635,7 @@ cdef extern from "parse_helper.h": cdef int64_t iINT64_MAX = INT64_MAX cdef int64_t iINT64_MIN = INT64_MIN + def maybe_convert_numeric(object[:] values, set na_values, bint convert_empty=True, bint coerce_numeric=False): """ @@ -613,9 +703,9 @@ def maybe_convert_numeric(object[:] values, set na_values, raise ValueError('integer out of range') else: seen_float = True - except: + except (TypeError, ValueError) as e: if not coerce_numeric: - raise + raise type(e)(str(e) + ' at position {}'.format(i)) floats[i] = nan seen_float = True @@ -699,7 +789,8 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, seen_float = 1 elif util.is_datetime64_object(val): if convert_datetime: - idatetimes[i] = convert_to_tsobject(val, None, None, 0, 0).value + idatetimes[i] = convert_to_tsobject( + val, None, None, 0, 0).value seen_datetime = 1 else: seen_object = 1 @@ -707,7 +798,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, break elif is_timedelta(val): if convert_timedelta: - itimedeltas[i] = convert_to_timedelta64(val, 'ns', False) + itimedeltas[i] = convert_to_timedelta64(val, 'ns') seen_timedelta = 1 else: seen_object = 1 @@ -734,7 +825,8 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, break else: seen_datetime = 1 - idatetimes[i] = convert_to_tsobject(val, None, None, 0, 0).value + idatetimes[i] = convert_to_tsobject( + val, None, None, 0, 0).value else: seen_object = 1 break @@ -784,7 +876,8 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, return floats elif seen_int: return ints - elif not seen_datetime and not seen_numeric and not seen_timedelta: + elif (not seen_datetime and not seen_numeric + and not seen_timedelta): return bools.view(np.bool_) else: @@ -814,7 +907,8 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, return floats elif seen_int: return ints - elif not seen_datetime and not seen_numeric and not seen_timedelta: + elif (not seen_datetime and not seen_numeric + and not seen_timedelta): return bools.view(np.bool_) return objects @@ -823,8 +917,9 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, def convert_sql_column(x): return maybe_convert_objects(x, try_float=1) + def try_parse_dates(ndarray[object] values, parser=None, - dayfirst=False,default=None): + dayfirst=False, default=None): cdef: Py_ssize_t i, n ndarray[object] result @@ -834,12 +929,12 @@ def try_parse_dates(ndarray[object] values, parser=None, if parser is None: if default is None: # GH2618 - date=datetime.now() - default=datetime(date.year,date.month,1) + date=datetime.now() + default=datetime(date.year, date.month, 1) try: from dateutil.parser import parse - parse_date = lambda x: parse(x, dayfirst=dayfirst,default=default) + parse_date = lambda x: parse(x, dayfirst=dayfirst, default=default) except ImportError: # pragma: no cover def parse_date(s): try: @@ -871,9 +966,10 @@ def try_parse_dates(ndarray[object] values, parser=None, return result + def try_parse_date_and_time(ndarray[object] dates, ndarray[object] times, 
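maybe_convert_numeric above now re-raises parse errors with the element position appended. A standalone illustration of that pattern follows; parse_all is a hypothetical helper, not pandas code.

def parse_all(values):
    # Re-raise the original exception type with the offending position
    # appended, mirroring the error-reporting change above.
    out = []
    for i, v in enumerate(values):
        try:
            out.append(float(v))
        except (TypeError, ValueError) as e:
            raise type(e)(str(e) + ' at position {}'.format(i))
    return out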
date_parser=None, time_parser=None, - dayfirst=False,default=None): + dayfirst=False, default=None): cdef: Py_ssize_t i, n ndarray[object] result @@ -887,8 +983,8 @@ def try_parse_date_and_time(ndarray[object] dates, ndarray[object] times, if date_parser is None: if default is None: # GH2618 - date=datetime.now() - default=datetime(date.year,date.month,1) + date=datetime.now() + default=datetime(date.year, date.month, 1) try: from dateutil.parser import parse @@ -943,6 +1039,7 @@ def try_parse_year_month_day(ndarray[object] years, ndarray[object] months, return result + def try_parse_datetime_components(ndarray[object] years, ndarray[object] months, ndarray[object] days, @@ -979,6 +1076,7 @@ def try_parse_datetime_components(ndarray[object] years, return result + def sanitize_objects(ndarray[object] values, set na_values, convert_empty=True): cdef: @@ -1002,6 +1100,7 @@ def sanitize_objects(ndarray[object] values, set na_values, return na_count + def maybe_convert_bool(ndarray[object] arr, true_values=None, false_values=None): cdef: @@ -1093,6 +1192,7 @@ def map_infer_mask(ndarray arr, object f, ndarray[uint8_t] mask, return result + def map_infer(ndarray arr, object f, bint convert=1): """ Substitute for np.vectorize with pandas-friendly dtype inference @@ -1173,6 +1273,7 @@ def to_object_array(list rows, int min_width=0): return result + def tuples_to_object_array(ndarray[object] tuples): cdef: Py_ssize_t i, j, n, k, tmp @@ -1189,6 +1290,7 @@ def tuples_to_object_array(ndarray[object] tuples): return result + def to_object_array_tuples(list rows): cdef: Py_ssize_t i, j, n, k, tmp @@ -1240,3 +1342,74 @@ def fast_multiget(dict mapping, ndarray keys, default=np.nan): output[i] = default return maybe_convert_objects(output) + + +def downcast_int64(ndarray[int64_t] arr, object na_values, + bint use_unsigned=0): + cdef: + Py_ssize_t i, n = len(arr) + int64_t mx = INT64_MIN + 1, mn = INT64_MAX + int64_t NA = na_values[np.int64] + int64_t val + ndarray[uint8_t] mask + int na_count = 0 + + _mask = np.empty(n, dtype=bool) + mask = _mask.view(np.uint8) + + for i in range(n): + val = arr[i] + + if val == NA: + mask[i] = 1 + na_count += 1 + continue + + # not NA + mask[i] = 0 + + if val > mx: + mx = val + + if val < mn: + mn = val + + if mn >= 0 and use_unsigned: + if mx <= UINT8_MAX - 1: + result = arr.astype(np.uint8) + if na_count: + np.putmask(result, _mask, na_values[np.uint8]) + return result + + if mx <= UINT16_MAX - 1: + result = arr.astype(np.uint16) + if na_count: + np.putmask(result, _mask, na_values[np.uint16]) + return result + + if mx <= UINT32_MAX - 1: + result = arr.astype(np.uint32) + if na_count: + np.putmask(result, _mask, na_values[np.uint32]) + return result + + else: + if mn >= INT8_MIN + 1 and mx <= INT8_MAX: + result = arr.astype(np.int8) + if na_count: + np.putmask(result, _mask, na_values[np.int8]) + return result + + if mn >= INT16_MIN + 1 and mx <= INT16_MAX: + result = arr.astype(np.int16) + if na_count: + np.putmask(result, _mask, na_values[np.int16]) + return result + + if mn >= INT32_MIN + 1 and mx <= INT32_MAX: + result = arr.astype(np.int32) + if na_count: + np.putmask(result, _mask, na_values[np.int32]) + return result + + return arr diff --git a/pandas/src/join.pyx b/pandas/src/join.pyx index 8a9cf01375a68..65c790beb5dbf 100644 --- a/pandas/src/join.pyx +++ b/pandas/src/join.pyx @@ -1,3 +1,43 @@ +# cython: profile=False + +from numpy cimport * +cimport numpy as np +import numpy as np + +cimport cython + +import_array() + +cimport util + +from numpy cimport NPY_INT8 
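downcast_int64 above picks the narrowest integer dtype that can represent the observed value range (reserving one value as an NA sentinel). A rough pure-NumPy sketch of the same idea, without the NA-sentinel bookkeeping, is shown below; downcast_sketch is illustrative, not the pandas function.

import numpy as np

def downcast_sketch(arr, use_unsigned=False):
    # Find the value range and cast to the narrowest integer dtype that
    # can hold it; fall back to the original array if nothing narrower fits.
    mn, mx = arr.min(), arr.max()
    if use_unsigned and mn >= 0:
        candidates = (np.uint8, np.uint16, np.uint32)
    else:
        candidates = (np.int8, np.int16, np.int32)
    for dtype in candidates:
        info = np.iinfo(dtype)
        if info.min <= mn and mx <= info.max:
            return arr.astype(dtype)
    return arr  # keep int64

# e.g. downcast_sketch(np.array([1, 2, 300], dtype=np.int64)).dtype -> int16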
as NPY_int8 +from numpy cimport NPY_INT16 as NPY_int16 +from numpy cimport NPY_INT32 as NPY_int32 +from numpy cimport NPY_INT64 as NPY_int64 +from numpy cimport NPY_FLOAT16 as NPY_float16 +from numpy cimport NPY_FLOAT32 as NPY_float32 +from numpy cimport NPY_FLOAT64 as NPY_float64 + +from numpy cimport (int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, + uint32_t, uint64_t, float16_t, float32_t, float64_t) + +int8 = np.dtype(np.int8) +int16 = np.dtype(np.int16) +int32 = np.dtype(np.int32) +int64 = np.dtype(np.int64) +float16 = np.dtype(np.float16) +float32 = np.dtype(np.float32) +float64 = np.dtype(np.float64) + +cdef double NaN = np.NaN +cdef double nan = NaN + +from pandas.algos import groupsort_indexer, ensure_platform_int +from pandas.core.algorithms import take_nd + +include "joins_func_helper.pxi" + + def inner_join(ndarray[int64_t] left, ndarray[int64_t] right, Py_ssize_t max_groups): cdef: @@ -48,6 +88,7 @@ def inner_join(ndarray[int64_t] left, ndarray[int64_t] right, return (_get_result_indexer(left_sorter, left_indexer), _get_result_indexer(right_sorter, right_indexer)) + def left_outer_join(ndarray[int64_t] left, ndarray[int64_t] right, Py_ssize_t max_groups, sort=True): cdef: @@ -108,23 +149,20 @@ def left_outer_join(ndarray[int64_t] left, ndarray[int64_t] right, # no multiple matches for any row on the left # this is a short-cut to avoid groupsort_indexer # otherwise, the `else` path also works in this case - if left_sorter.dtype != np.int_: - left_sorter = left_sorter.astype(np.int_) + left_sorter = ensure_platform_int(left_sorter) - rev = np.empty(len(left), dtype=np.int_) + rev = np.empty(len(left), dtype=np.intp) rev.put(left_sorter, np.arange(len(left))) else: rev, _ = groupsort_indexer(left_indexer, len(left)) - if rev.dtype != np.int_: - rev = rev.astype(np.int_) + rev = ensure_platform_int(rev) right_indexer = right_indexer.take(rev) left_indexer = left_indexer.take(rev) return left_indexer, right_indexer - def full_outer_join(ndarray[int64_t] left, ndarray[int64_t] right, Py_ssize_t max_groups): cdef: @@ -188,13 +226,9 @@ def full_outer_join(ndarray[int64_t] left, ndarray[int64_t] right, _get_result_indexer(right_sorter, right_indexer)) - def _get_result_indexer(sorter, indexer): - if indexer.dtype != np.int_: - indexer = indexer.astype(np.int_) if len(sorter) > 0: - res = sorter.take(indexer) - np.putmask(res, indexer == -1, -1) + res = take_nd(sorter, indexer, fill_value=-1) else: # length-0 case res = np.empty(len(indexer), dtype=np.int64) @@ -203,7 +237,6 @@ def _get_result_indexer(sorter, indexer): return res - def ffill_indexer(ndarray[int64_t] indexer): cdef: Py_ssize_t i, n = len(indexer) @@ -247,3 +280,5 @@ def ffill_by_group(ndarray[int64_t] indexer, ndarray[int64_t] group_ids, return result + +include "join_helper.pxi" diff --git a/pandas/src/join_helper.pxi b/pandas/src/join_helper.pxi new file mode 100644 index 0000000000000..44b8159351492 --- /dev/null +++ b/pandas/src/join_helper.pxi @@ -0,0 +1,1899 @@ +""" +Template for each `dtype` helper function for join + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +#---------------------------------------------------------------------- +# left_join_indexer, inner_join_indexer, outer_join_indexer +#---------------------------------------------------------------------- + +# Joins on ordered, unique indices + +# right might contain non-unique values + + +@cython.wraparound(False) +@cython.boundscheck(False) +def left_join_indexer_unique_float64(ndarray[float64_t] left, + 
ndarray[float64_t] right): + cdef: + Py_ssize_t i, j, nleft, nright + ndarray[int64_t] indexer + float64_t lval, rval + + i = 0 + j = 0 + nleft = len(left) + nright = len(right) + + indexer = np.empty(nleft, dtype=np.int64) + while True: + if i == nleft: + break + + if j == nright: + indexer[i] = -1 + i += 1 + continue + + rval = right[j] + + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + + if left[i] == right[j]: + indexer[i] = j + i += 1 + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + j += 1 + elif left[i] > rval: + indexer[i] = -1 + j += 1 + else: + indexer[i] = -1 + i += 1 + return indexer + + +# @cython.wraparound(False) +# @cython.boundscheck(False) +def left_join_indexer_float64(ndarray[float64_t] left, + ndarray[float64_t] right): + """ + Two-pass algorithm for monotonic indexes. Handles many-to-one merges + """ + cdef: + Py_ssize_t i, j, k, nright, nleft, count + float64_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[float64_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.float64) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + i += 1 + count += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + + +@cython.wraparound(False) +@cython.boundscheck(False) +def inner_join_indexer_float64(ndarray[float64_t] left, + ndarray[float64_t] right): + """ + Two-pass algorithm for monotonic indexes. 
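The generated *_join_indexer_* functions above all follow the same two-pass shape: one pass over the sorted inputs to size the output, a second pass to fill the value and indexer arrays. Below is a simplified pure-Python sketch of an inner join indexer, assuming unique keys on both sides (the real templates also handle many-to-one merges); inner_join_indexer_sketch is an illustrative name.

import numpy as np

def inner_join_indexer_sketch(left, right):
    # Both inputs are sorted; walk them once to count matches, then a
    # second time to fill the result and indexer arrays (no resizing).
    def walk(emit):
        i = j = count = 0
        while i < len(left) and j < len(right):
            if left[i] == right[j]:
                emit(count, i, j)
                count += 1
                i += 1        # simplified: assumes unique keys on both sides
                j += 1
            elif left[i] < right[j]:
                i += 1
            else:
                j += 1
        return count

    n = walk(lambda *_: None)                      # pass 1: count matches
    lidx = np.empty(n, dtype=np.int64)
    ridx = np.empty(n, dtype=np.int64)
    res = np.empty(n, dtype=np.asarray(left).dtype)

    def fill(k, i, j):
        lidx[k], ridx[k], res[k] = i, j, left[i]
    walk(fill)                                     # pass 2: fill results
    return res, lidx, ridx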
Handles many-to-one merges + """ + cdef: + Py_ssize_t i, j, k, nright, nleft, count + float64_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[float64_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.float64) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = rval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + + +@cython.wraparound(False) +@cython.boundscheck(False) +def outer_join_indexer_float64(ndarray[float64_t] left, + ndarray[float64_t] right): + cdef: + Py_ssize_t i, j, nright, nleft, count + float64_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[float64_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft == 0: + count = nright + elif nright == 0: + count = nleft + else: + while True: + if i == nleft: + count += nright - j + break + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + count += 1 + j += 1 + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.float64) + + # do it again, but populate the indexers / result + + i = 0 + j = 0 + count = 0 + if nleft == 0: + for j in range(nright): + lindexer[j] = -1 + rindexer[j] = j + result[j] = right[j] + elif nright == 0: + for i in range(nleft): + lindexer[i] = i + rindexer[i] = -1 + result[i] = left[i] + else: + while True: + if i == nleft: + while j < nright: + lindexer[count] = -1 + rindexer[count] = j + result[count] = right[j] + count += 1 + j += 1 + break + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = lval + count += 1 + i += 1 + else: + 
lindexer[count] = -1 + rindexer[count] = j + result[count] = rval + count += 1 + j += 1 + + return result, lindexer, rindexer + +# Joins on ordered, unique indices + +# right might contain non-unique values + + +@cython.wraparound(False) +@cython.boundscheck(False) +def left_join_indexer_unique_float32(ndarray[float32_t] left, + ndarray[float32_t] right): + cdef: + Py_ssize_t i, j, nleft, nright + ndarray[int64_t] indexer + float32_t lval, rval + + i = 0 + j = 0 + nleft = len(left) + nright = len(right) + + indexer = np.empty(nleft, dtype=np.int64) + while True: + if i == nleft: + break + + if j == nright: + indexer[i] = -1 + i += 1 + continue + + rval = right[j] + + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + + if left[i] == right[j]: + indexer[i] = j + i += 1 + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + j += 1 + elif left[i] > rval: + indexer[i] = -1 + j += 1 + else: + indexer[i] = -1 + i += 1 + return indexer + + +# @cython.wraparound(False) +# @cython.boundscheck(False) +def left_join_indexer_float32(ndarray[float32_t] left, + ndarray[float32_t] right): + """ + Two-pass algorithm for monotonic indexes. Handles many-to-one merges + """ + cdef: + Py_ssize_t i, j, k, nright, nleft, count + float32_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[float32_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.float32) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + i += 1 + count += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + + +@cython.wraparound(False) +@cython.boundscheck(False) +def inner_join_indexer_float32(ndarray[float32_t] left, + ndarray[float32_t] right): + """ + Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges + """ + cdef: + Py_ssize_t i, j, k, nright, nleft, count + float32_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[float32_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.float32) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = rval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + + +@cython.wraparound(False) +@cython.boundscheck(False) +def outer_join_indexer_float32(ndarray[float32_t] left, + ndarray[float32_t] right): + cdef: + Py_ssize_t i, j, nright, nleft, count + float32_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[float32_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft == 0: + count = nright + elif nright == 0: + count = nleft + else: + while True: + if i == nleft: + count += nright - j + break + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + count += 1 + j += 1 + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.float32) + + # do it again, but populate the indexers / result + + i = 0 + j = 0 + count = 0 + if nleft == 0: + for j in range(nright): + lindexer[j] = -1 + rindexer[j] = j + result[j] = right[j] + elif nright == 0: + for i in range(nleft): + lindexer[i] = i + rindexer[i] = -1 + result[i] = left[i] + else: + while True: + if i == nleft: + while j < nright: + lindexer[count] = -1 + rindexer[count] = j + result[count] = right[j] + count += 1 + j += 1 + break + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = lval + count += 1 + i += 1 + else: + 
lindexer[count] = -1 + rindexer[count] = j + result[count] = rval + count += 1 + j += 1 + + return result, lindexer, rindexer + +# Joins on ordered, unique indices + +# right might contain non-unique values + + +@cython.wraparound(False) +@cython.boundscheck(False) +def left_join_indexer_unique_object(ndarray[object] left, + ndarray[object] right): + cdef: + Py_ssize_t i, j, nleft, nright + ndarray[int64_t] indexer + object lval, rval + + i = 0 + j = 0 + nleft = len(left) + nright = len(right) + + indexer = np.empty(nleft, dtype=np.int64) + while True: + if i == nleft: + break + + if j == nright: + indexer[i] = -1 + i += 1 + continue + + rval = right[j] + + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + + if left[i] == right[j]: + indexer[i] = j + i += 1 + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + j += 1 + elif left[i] > rval: + indexer[i] = -1 + j += 1 + else: + indexer[i] = -1 + i += 1 + return indexer + + +# @cython.wraparound(False) +# @cython.boundscheck(False) +def left_join_indexer_object(ndarray[object] left, + ndarray[object] right): + """ + Two-pass algorithm for monotonic indexes. Handles many-to-one merges + """ + cdef: + Py_ssize_t i, j, k, nright, nleft, count + object lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[object] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=object) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + i += 1 + count += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + + +@cython.wraparound(False) +@cython.boundscheck(False) +def inner_join_indexer_object(ndarray[object] left, + ndarray[object] right): + """ + Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges + """ + cdef: + Py_ssize_t i, j, k, nright, nleft, count + object lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[object] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=object) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = rval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + + +@cython.wraparound(False) +@cython.boundscheck(False) +def outer_join_indexer_object(ndarray[object] left, + ndarray[object] right): + cdef: + Py_ssize_t i, j, nright, nleft, count + object lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[object] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft == 0: + count = nright + elif nright == 0: + count = nleft + else: + while True: + if i == nleft: + count += nright - j + break + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + count += 1 + j += 1 + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=object) + + # do it again, but populate the indexers / result + + i = 0 + j = 0 + count = 0 + if nleft == 0: + for j in range(nright): + lindexer[j] = -1 + rindexer[j] = j + result[j] = right[j] + elif nright == 0: + for i in range(nleft): + lindexer[i] = i + rindexer[i] = -1 + result[i] = left[i] + else: + while True: + if i == nleft: + while j < nright: + lindexer[count] = -1 + rindexer[count] = j + result[count] = right[j] + count += 1 + j += 1 + break + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = lval + count += 1 + i += 1 + else: + lindexer[count] = -1 + 
rindexer[count] = j + result[count] = rval + count += 1 + j += 1 + + return result, lindexer, rindexer + +# Joins on ordered, unique indices + +# right might contain non-unique values + + +@cython.wraparound(False) +@cython.boundscheck(False) +def left_join_indexer_unique_int32(ndarray[int32_t] left, + ndarray[int32_t] right): + cdef: + Py_ssize_t i, j, nleft, nright + ndarray[int64_t] indexer + int32_t lval, rval + + i = 0 + j = 0 + nleft = len(left) + nright = len(right) + + indexer = np.empty(nleft, dtype=np.int64) + while True: + if i == nleft: + break + + if j == nright: + indexer[i] = -1 + i += 1 + continue + + rval = right[j] + + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + + if left[i] == right[j]: + indexer[i] = j + i += 1 + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + j += 1 + elif left[i] > rval: + indexer[i] = -1 + j += 1 + else: + indexer[i] = -1 + i += 1 + return indexer + + +# @cython.wraparound(False) +# @cython.boundscheck(False) +def left_join_indexer_int32(ndarray[int32_t] left, + ndarray[int32_t] right): + """ + Two-pass algorithm for monotonic indexes. Handles many-to-one merges + """ + cdef: + Py_ssize_t i, j, k, nright, nleft, count + int32_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[int32_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.int32) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + i += 1 + count += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + + +@cython.wraparound(False) +@cython.boundscheck(False) +def inner_join_indexer_int32(ndarray[int32_t] left, + ndarray[int32_t] right): + """ + Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges + """ + cdef: + Py_ssize_t i, j, k, nright, nleft, count + int32_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[int32_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.int32) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = rval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + + +@cython.wraparound(False) +@cython.boundscheck(False) +def outer_join_indexer_int32(ndarray[int32_t] left, + ndarray[int32_t] right): + cdef: + Py_ssize_t i, j, nright, nleft, count + int32_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[int32_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft == 0: + count = nright + elif nright == 0: + count = nleft + else: + while True: + if i == nleft: + count += nright - j + break + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + count += 1 + j += 1 + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.int32) + + # do it again, but populate the indexers / result + + i = 0 + j = 0 + count = 0 + if nleft == 0: + for j in range(nright): + lindexer[j] = -1 + rindexer[j] = j + result[j] = right[j] + elif nright == 0: + for i in range(nleft): + lindexer[i] = i + rindexer[i] = -1 + result[i] = left[i] + else: + while True: + if i == nleft: + while j < nright: + lindexer[count] = -1 + rindexer[count] = j + result[count] = right[j] + count += 1 + j += 1 + break + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = lval + count += 1 + i += 1 + else: + lindexer[count] = -1 + 
rindexer[count] = j + result[count] = rval + count += 1 + j += 1 + + return result, lindexer, rindexer + +# Joins on ordered, unique indices + +# right might contain non-unique values + + +@cython.wraparound(False) +@cython.boundscheck(False) +def left_join_indexer_unique_int64(ndarray[int64_t] left, + ndarray[int64_t] right): + cdef: + Py_ssize_t i, j, nleft, nright + ndarray[int64_t] indexer + int64_t lval, rval + + i = 0 + j = 0 + nleft = len(left) + nright = len(right) + + indexer = np.empty(nleft, dtype=np.int64) + while True: + if i == nleft: + break + + if j == nright: + indexer[i] = -1 + i += 1 + continue + + rval = right[j] + + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + + if left[i] == right[j]: + indexer[i] = j + i += 1 + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + j += 1 + elif left[i] > rval: + indexer[i] = -1 + j += 1 + else: + indexer[i] = -1 + i += 1 + return indexer + + +# @cython.wraparound(False) +# @cython.boundscheck(False) +def left_join_indexer_int64(ndarray[int64_t] left, + ndarray[int64_t] right): + """ + Two-pass algorithm for monotonic indexes. Handles many-to-one merges + """ + cdef: + Py_ssize_t i, j, k, nright, nleft, count + int64_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[int64_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.int64) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + i += 1 + count += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + + +@cython.wraparound(False) +@cython.boundscheck(False) +def inner_join_indexer_int64(ndarray[int64_t] left, + ndarray[int64_t] right): + """ + Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges + """ + cdef: + Py_ssize_t i, j, k, nright, nleft, count + int64_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[int64_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.int64) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = rval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + + +@cython.wraparound(False) +@cython.boundscheck(False) +def outer_join_indexer_int64(ndarray[int64_t] left, + ndarray[int64_t] right): + cdef: + Py_ssize_t i, j, nright, nleft, count + int64_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[int64_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft == 0: + count = nright + elif nright == 0: + count = nleft + else: + while True: + if i == nleft: + count += nright - j + break + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + count += 1 + j += 1 + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.int64) + + # do it again, but populate the indexers / result + + i = 0 + j = 0 + count = 0 + if nleft == 0: + for j in range(nright): + lindexer[j] = -1 + rindexer[j] = j + result[j] = right[j] + elif nright == 0: + for i in range(nleft): + lindexer[i] = i + rindexer[i] = -1 + result[i] = left[i] + else: + while True: + if i == nleft: + while j < nright: + lindexer[count] = -1 + rindexer[count] = j + result[count] = right[j] + count += 1 + j += 1 + break + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = lval + count += 1 + i += 1 + else: + lindexer[count] = -1 + 
rindexer[count] = j + result[count] = rval + count += 1 + j += 1 + + return result, lindexer, rindexer diff --git a/pandas/src/join_helper.pxi.in b/pandas/src/join_helper.pxi.in new file mode 100644 index 0000000000000..5b55ec2b1bf6d --- /dev/null +++ b/pandas/src/join_helper.pxi.in @@ -0,0 +1,407 @@ +""" +Template for each `dtype` helper function for join + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +#---------------------------------------------------------------------- +# left_join_indexer, inner_join_indexer, outer_join_indexer +#---------------------------------------------------------------------- + +{{py: + +# name, c_type, dtype +dtypes = [('float64', 'float64_t', 'np.float64'), + ('float32', 'float32_t', 'np.float32'), + ('object', 'object', 'object'), + ('int32', 'int32_t', 'np.int32'), + ('int64', 'int64_t', 'np.int64')] + +def get_dispatch(dtypes): + + for name, c_type, dtype in dtypes: + yield name, c_type, dtype + +}} + +{{for name, c_type, dtype in get_dispatch(dtypes)}} + +# Joins on ordered, unique indices + +# right might contain non-unique values + + +@cython.wraparound(False) +@cython.boundscheck(False) +def left_join_indexer_unique_{{name}}(ndarray[{{c_type}}] left, + ndarray[{{c_type}}] right): + cdef: + Py_ssize_t i, j, nleft, nright + ndarray[int64_t] indexer + {{c_type}} lval, rval + + i = 0 + j = 0 + nleft = len(left) + nright = len(right) + + indexer = np.empty(nleft, dtype=np.int64) + while True: + if i == nleft: + break + + if j == nright: + indexer[i] = -1 + i += 1 + continue + + rval = right[j] + + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + + if left[i] == right[j]: + indexer[i] = j + i += 1 + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + j += 1 + elif left[i] > rval: + indexer[i] = -1 + j += 1 + else: + indexer[i] = -1 + i += 1 + return indexer + + +# @cython.wraparound(False) +# @cython.boundscheck(False) +def left_join_indexer_{{name}}(ndarray[{{c_type}}] left, + ndarray[{{c_type}}] right): + """ + Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges + """ + cdef: + Py_ssize_t i, j, k, nright, nleft, count + {{c_type}} lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[{{c_type}}] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype={{dtype}}) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + i += 1 + count += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + + +@cython.wraparound(False) +@cython.boundscheck(False) +def inner_join_indexer_{{name}}(ndarray[{{c_type}}] left, + ndarray[{{c_type}}] right): + """ + Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges + """ + cdef: + Py_ssize_t i, j, k, nright, nleft, count + {{c_type}} lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[{{c_type}}] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype={{dtype}}) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = rval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + + +@cython.wraparound(False) +@cython.boundscheck(False) +def outer_join_indexer_{{name}}(ndarray[{{c_type}}] left, + ndarray[{{c_type}}] right): + cdef: + Py_ssize_t i, j, nright, nleft, count + {{c_type}} lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[{{c_type}}] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft == 0: + count = nright + elif nright == 0: + count = nleft + else: + while True: + if i == nleft: + count += nright - j + break + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + count += 1 + j += 1 + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype={{dtype}}) + + # do it again, but populate the indexers / result + + i = 0 + j = 0 + count = 0 + if nleft == 0: + for j in range(nright): + lindexer[j] = -1 + rindexer[j] = j + result[j] = right[j] + elif nright == 0: + for i in range(nleft): + lindexer[i] = i + rindexer[i] = -1 + result[i] = left[i] + else: + while True: + if i == nleft: + while j < nright: + lindexer[count] = -1 + rindexer[count] = j + result[count] = right[j] + count += 1 + j += 1 + break + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = lval + count += 1 + i += 1 + else: + 
lindexer[count] = -1 + rindexer[count] = j + result[count] = rval + count += 1 + j += 1 + + return result, lindexer, rindexer + +{{endfor}} \ No newline at end of file diff --git a/pandas/src/joins_func_helper.pxi b/pandas/src/joins_func_helper.pxi new file mode 100644 index 0000000000000..7a59da37c5ced --- /dev/null +++ b/pandas/src/joins_func_helper.pxi @@ -0,0 +1,373 @@ +""" +Template for each `dtype` helper function for hashtable + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +#---------------------------------------------------------------------- +# asof_join_by +#---------------------------------------------------------------------- + + +from hashtable cimport * + + +def asof_join_int64_t_by_object(ndarray[int64_t] left_values, + ndarray[int64_t] right_values, + ndarray[object] left_by_values, + ndarray[object] right_by_values, + bint allow_exact_matches=1, + tolerance=None): + + cdef: + Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos + ndarray[int64_t] left_indexer, right_indexer + bint has_tolerance = 0 + int64_t tolerance_ + PyObjectHashTable hash_table + object by_value + + # if we are using tolerance, set our objects + if tolerance is not None: + has_tolerance = 1 + tolerance_ = tolerance + + left_size = len(left_values) + right_size = len(right_values) + + left_indexer = np.empty(left_size, dtype=np.int64) + right_indexer = np.empty(left_size, dtype=np.int64) + + hash_table = PyObjectHashTable(right_size) + + right_pos = 0 + for left_pos in range(left_size): + # restart right_pos if it went negative in a previous iteration + if right_pos < 0: + right_pos = 0 + + # find last position in right whose value is less than left's value + if allow_exact_matches: + while right_pos < right_size and\ + right_values[right_pos] <= left_values[left_pos]: + hash_table.set_item(right_by_values[right_pos], right_pos) + right_pos += 1 + else: + while right_pos < right_size and\ + right_values[right_pos] < left_values[left_pos]: + hash_table.set_item(right_by_values[right_pos], right_pos) + right_pos += 1 + right_pos -= 1 + + # save positions as the desired index + by_value = left_by_values[left_pos] + found_right_pos = hash_table.get_item(by_value)\ + if by_value in hash_table else -1 + left_indexer[left_pos] = left_pos + right_indexer[left_pos] = found_right_pos + + # if needed, verify that tolerance is met + if has_tolerance and found_right_pos != -1: + diff = left_values[left_pos] - right_values[found_right_pos] + if diff > tolerance_: + right_indexer[left_pos] = -1 + + return left_indexer, right_indexer + + +def asof_join_double_by_object(ndarray[double] left_values, + ndarray[double] right_values, + ndarray[object] left_by_values, + ndarray[object] right_by_values, + bint allow_exact_matches=1, + tolerance=None): + + cdef: + Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos + ndarray[int64_t] left_indexer, right_indexer + bint has_tolerance = 0 + double tolerance_ + PyObjectHashTable hash_table + object by_value + + # if we are using tolerance, set our objects + if tolerance is not None: + has_tolerance = 1 + tolerance_ = tolerance + + left_size = len(left_values) + right_size = len(right_values) + + left_indexer = np.empty(left_size, dtype=np.int64) + right_indexer = np.empty(left_size, dtype=np.int64) + + hash_table = PyObjectHashTable(right_size) + + right_pos = 0 + for left_pos in range(left_size): + # restart right_pos if it went negative in a previous iteration + if right_pos < 0: + right_pos = 0 + + # find 
last position in right whose value is less than left's value + if allow_exact_matches: + while right_pos < right_size and\ + right_values[right_pos] <= left_values[left_pos]: + hash_table.set_item(right_by_values[right_pos], right_pos) + right_pos += 1 + else: + while right_pos < right_size and\ + right_values[right_pos] < left_values[left_pos]: + hash_table.set_item(right_by_values[right_pos], right_pos) + right_pos += 1 + right_pos -= 1 + + # save positions as the desired index + by_value = left_by_values[left_pos] + found_right_pos = hash_table.get_item(by_value)\ + if by_value in hash_table else -1 + left_indexer[left_pos] = left_pos + right_indexer[left_pos] = found_right_pos + + # if needed, verify that tolerance is met + if has_tolerance and found_right_pos != -1: + diff = left_values[left_pos] - right_values[found_right_pos] + if diff > tolerance_: + right_indexer[left_pos] = -1 + + return left_indexer, right_indexer + + +def asof_join_int64_t_by_int64_t(ndarray[int64_t] left_values, + ndarray[int64_t] right_values, + ndarray[int64_t] left_by_values, + ndarray[int64_t] right_by_values, + bint allow_exact_matches=1, + tolerance=None): + + cdef: + Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos + ndarray[int64_t] left_indexer, right_indexer + bint has_tolerance = 0 + int64_t tolerance_ + Int64HashTable hash_table + int64_t by_value + + # if we are using tolerance, set our objects + if tolerance is not None: + has_tolerance = 1 + tolerance_ = tolerance + + left_size = len(left_values) + right_size = len(right_values) + + left_indexer = np.empty(left_size, dtype=np.int64) + right_indexer = np.empty(left_size, dtype=np.int64) + + hash_table = Int64HashTable(right_size) + + right_pos = 0 + for left_pos in range(left_size): + # restart right_pos if it went negative in a previous iteration + if right_pos < 0: + right_pos = 0 + + # find last position in right whose value is less than left's value + if allow_exact_matches: + while right_pos < right_size and\ + right_values[right_pos] <= left_values[left_pos]: + hash_table.set_item(right_by_values[right_pos], right_pos) + right_pos += 1 + else: + while right_pos < right_size and\ + right_values[right_pos] < left_values[left_pos]: + hash_table.set_item(right_by_values[right_pos], right_pos) + right_pos += 1 + right_pos -= 1 + + # save positions as the desired index + by_value = left_by_values[left_pos] + found_right_pos = hash_table.get_item(by_value)\ + if by_value in hash_table else -1 + left_indexer[left_pos] = left_pos + right_indexer[left_pos] = found_right_pos + + # if needed, verify that tolerance is met + if has_tolerance and found_right_pos != -1: + diff = left_values[left_pos] - right_values[found_right_pos] + if diff > tolerance_: + right_indexer[left_pos] = -1 + + return left_indexer, right_indexer + + +def asof_join_double_by_int64_t(ndarray[double] left_values, + ndarray[double] right_values, + ndarray[int64_t] left_by_values, + ndarray[int64_t] right_by_values, + bint allow_exact_matches=1, + tolerance=None): + + cdef: + Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos + ndarray[int64_t] left_indexer, right_indexer + bint has_tolerance = 0 + double tolerance_ + Int64HashTable hash_table + int64_t by_value + + # if we are using tolerance, set our objects + if tolerance is not None: + has_tolerance = 1 + tolerance_ = tolerance + + left_size = len(left_values) + right_size = len(right_values) + + left_indexer = np.empty(left_size, dtype=np.int64) + right_indexer = np.empty(left_size, 
dtype=np.int64) + + hash_table = Int64HashTable(right_size) + + right_pos = 0 + for left_pos in range(left_size): + # restart right_pos if it went negative in a previous iteration + if right_pos < 0: + right_pos = 0 + + # find last position in right whose value is less than left's value + if allow_exact_matches: + while right_pos < right_size and\ + right_values[right_pos] <= left_values[left_pos]: + hash_table.set_item(right_by_values[right_pos], right_pos) + right_pos += 1 + else: + while right_pos < right_size and\ + right_values[right_pos] < left_values[left_pos]: + hash_table.set_item(right_by_values[right_pos], right_pos) + right_pos += 1 + right_pos -= 1 + + # save positions as the desired index + by_value = left_by_values[left_pos] + found_right_pos = hash_table.get_item(by_value)\ + if by_value in hash_table else -1 + left_indexer[left_pos] = left_pos + right_indexer[left_pos] = found_right_pos + + # if needed, verify that tolerance is met + if has_tolerance and found_right_pos != -1: + diff = left_values[left_pos] - right_values[found_right_pos] + if diff > tolerance_: + right_indexer[left_pos] = -1 + + return left_indexer, right_indexer + + +#---------------------------------------------------------------------- +# asof_join +#---------------------------------------------------------------------- + + +def asof_join_int64_t(ndarray[int64_t] left_values, + ndarray[int64_t] right_values, + bint allow_exact_matches=1, + tolerance=None): + + cdef: + Py_ssize_t left_pos, right_pos, left_size, right_size + ndarray[int64_t] left_indexer, right_indexer + bint has_tolerance = 0 + int64_t tolerance_ + + # if we are using tolerance, set our objects + if tolerance is not None: + has_tolerance = 1 + tolerance_ = tolerance + + left_size = len(left_values) + right_size = len(right_values) + + left_indexer = np.empty(left_size, dtype=np.int64) + right_indexer = np.empty(left_size, dtype=np.int64) + + right_pos = 0 + for left_pos in range(left_size): + # restart right_pos if it went negative in a previous iteration + if right_pos < 0: + right_pos = 0 + + # find last position in right whose value is less than left's value + if allow_exact_matches: + while right_pos < right_size and\ + right_values[right_pos] <= left_values[left_pos]: + right_pos += 1 + else: + while right_pos < right_size and\ + right_values[right_pos] < left_values[left_pos]: + right_pos += 1 + right_pos -= 1 + + # save positions as the desired index + left_indexer[left_pos] = left_pos + right_indexer[left_pos] = right_pos + + # if needed, verify that tolerance is met + if has_tolerance and right_pos != -1: + diff = left_values[left_pos] - right_values[right_pos] + if diff > tolerance_: + right_indexer[left_pos] = -1 + + return left_indexer, right_indexer + + +def asof_join_double(ndarray[double] left_values, + ndarray[double] right_values, + bint allow_exact_matches=1, + tolerance=None): + + cdef: + Py_ssize_t left_pos, right_pos, left_size, right_size + ndarray[int64_t] left_indexer, right_indexer + bint has_tolerance = 0 + double tolerance_ + + # if we are using tolerance, set our objects + if tolerance is not None: + has_tolerance = 1 + tolerance_ = tolerance + + left_size = len(left_values) + right_size = len(right_values) + + left_indexer = np.empty(left_size, dtype=np.int64) + right_indexer = np.empty(left_size, dtype=np.int64) + + right_pos = 0 + for left_pos in range(left_size): + # restart right_pos if it went negative in a previous iteration + if right_pos < 0: + right_pos = 0 + + # find last position in right whose 
value is less than left's value + if allow_exact_matches: + while right_pos < right_size and\ + right_values[right_pos] <= left_values[left_pos]: + right_pos += 1 + else: + while right_pos < right_size and\ + right_values[right_pos] < left_values[left_pos]: + right_pos += 1 + right_pos -= 1 + + # save positions as the desired index + left_indexer[left_pos] = left_pos + right_indexer[left_pos] = right_pos + + # if needed, verify that tolerance is met + if has_tolerance and right_pos != -1: + diff = left_values[left_pos] - right_values[right_pos] + if diff > tolerance_: + right_indexer[left_pos] = -1 + + return left_indexer, right_indexer diff --git a/pandas/src/joins_func_helper.pxi.in b/pandas/src/joins_func_helper.pxi.in new file mode 100644 index 0000000000000..06c35cfb69e53 --- /dev/null +++ b/pandas/src/joins_func_helper.pxi.in @@ -0,0 +1,160 @@ +""" +Template for each `dtype` helper function for hashtable + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +#---------------------------------------------------------------------- +# asof_join_by +#---------------------------------------------------------------------- + +{{py: + +# table_type, by_dtype +by_dtypes = [('PyObjectHashTable', 'object'), ('Int64HashTable', 'int64_t')] + +# on_dtype +on_dtypes = ['int64_t', 'double'] + +}} + + +from hashtable cimport * + +{{for table_type, by_dtype in by_dtypes}} +{{for on_dtype in on_dtypes}} + + +def asof_join_{{on_dtype}}_by_{{by_dtype}}(ndarray[{{on_dtype}}] left_values, + ndarray[{{on_dtype}}] right_values, + ndarray[{{by_dtype}}] left_by_values, + ndarray[{{by_dtype}}] right_by_values, + bint allow_exact_matches=1, + tolerance=None): + + cdef: + Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos + ndarray[int64_t] left_indexer, right_indexer + bint has_tolerance = 0 + {{on_dtype}} tolerance_ + {{table_type}} hash_table + {{by_dtype}} by_value + + # if we are using tolerance, set our objects + if tolerance is not None: + has_tolerance = 1 + tolerance_ = tolerance + + left_size = len(left_values) + right_size = len(right_values) + + left_indexer = np.empty(left_size, dtype=np.int64) + right_indexer = np.empty(left_size, dtype=np.int64) + + hash_table = {{table_type}}(right_size) + + right_pos = 0 + for left_pos in range(left_size): + # restart right_pos if it went negative in a previous iteration + if right_pos < 0: + right_pos = 0 + + # find last position in right whose value is less than left's value + if allow_exact_matches: + while right_pos < right_size and\ + right_values[right_pos] <= left_values[left_pos]: + hash_table.set_item(right_by_values[right_pos], right_pos) + right_pos += 1 + else: + while right_pos < right_size and\ + right_values[right_pos] < left_values[left_pos]: + hash_table.set_item(right_by_values[right_pos], right_pos) + right_pos += 1 + right_pos -= 1 + + # save positions as the desired index + by_value = left_by_values[left_pos] + found_right_pos = hash_table.get_item(by_value)\ + if by_value in hash_table else -1 + left_indexer[left_pos] = left_pos + right_indexer[left_pos] = found_right_pos + + # if needed, verify that tolerance is met + if has_tolerance and found_right_pos != -1: + diff = left_values[left_pos] - right_values[found_right_pos] + if diff > tolerance_: + right_indexer[left_pos] = -1 + + return left_indexer, right_indexer + +{{endfor}} +{{endfor}} + + +#---------------------------------------------------------------------- +# asof_join 
+#---------------------------------------------------------------------- + +{{py: + +# on_dtype +dtypes = ['int64_t', 'double'] + +}} + +{{for on_dtype in dtypes}} + + +def asof_join_{{on_dtype}}(ndarray[{{on_dtype}}] left_values, + ndarray[{{on_dtype}}] right_values, + bint allow_exact_matches=1, + tolerance=None): + + cdef: + Py_ssize_t left_pos, right_pos, left_size, right_size + ndarray[int64_t] left_indexer, right_indexer + bint has_tolerance = 0 + {{on_dtype}} tolerance_ + + # if we are using tolerance, set our objects + if tolerance is not None: + has_tolerance = 1 + tolerance_ = tolerance + + left_size = len(left_values) + right_size = len(right_values) + + left_indexer = np.empty(left_size, dtype=np.int64) + right_indexer = np.empty(left_size, dtype=np.int64) + + right_pos = 0 + for left_pos in range(left_size): + # restart right_pos if it went negative in a previous iteration + if right_pos < 0: + right_pos = 0 + + # find last position in right whose value is less than left's value + if allow_exact_matches: + while right_pos < right_size and\ + right_values[right_pos] <= left_values[left_pos]: + right_pos += 1 + else: + while right_pos < right_size and\ + right_values[right_pos] < left_values[left_pos]: + right_pos += 1 + right_pos -= 1 + + # save positions as the desired index + left_indexer[left_pos] = left_pos + right_indexer[left_pos] = right_pos + + # if needed, verify that tolerance is met + if has_tolerance and right_pos != -1: + diff = left_values[left_pos] - right_values[right_pos] + if diff > tolerance_: + right_indexer[left_pos] = -1 + + return left_indexer, right_indexer + +{{endfor}} + diff --git a/pandas/src/klib/khash.h b/pandas/src/klib/khash.h index 0f1a17c6333f4..dc004a0e1770b 100644 --- a/pandas/src/klib/khash.h +++ b/pandas/src/klib/khash.h @@ -52,7 +52,7 @@ int main() { * The capacity is a power of 2. This seems to dramatically improve the speed for simple keys. Thank Zilong Tan for the suggestion. Reference: - - http://code.google.com/p/ulib/ + - https://github.com/stefanocasazza/ULib - http://nothings.org/computer/judy/ * Allow to optionally use linear probing which usually has better diff --git a/pandas/src/klib/khash_python.h b/pandas/src/klib/khash_python.h index cdd94b5d8522f..a375a73b04c9e 100644 --- a/pandas/src/klib/khash_python.h +++ b/pandas/src/klib/khash_python.h @@ -2,9 +2,21 @@ #include "khash.h" -// kludge - -#define kh_float64_hash_func _Py_HashDouble +// Previously we were using the built in cpython hash function for doubles +// python 2.7 https://github.com/python/cpython/blob/2.7/Objects/object.c#L1021 +// python 3.5 https://github.com/python/cpython/blob/3.5/Python/pyhash.c#L85 + +// The python 3 hash function has the invariant hash(x) == hash(int(x)) == hash(decimal(x)) +// and the size of hash may be different by platform / version (long in py2, Py_ssize_t in py3). +// We don't need those invariants because types will be cast before hashing, and if Py_ssize_t +// is 64 bits the truncation causes collission issues. Given all that, we use our own +// simple hash, viewing the double bytes as an int64 and using khash's default +// hash for 64 bit integers. 
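(Editorial sketch, not part of the patch.) The hashing scheme described in the comment above can be mimicked in plain Python: reinterpret the eight bytes of the double as an unsigned 64-bit integer (struct stands in here for the C pointer cast in asint64) and apply khash's shift/xor mix, truncated to 32 bits just as the (khint32_t) cast in the macro below does.

import struct

def float64_hash(key):
    # Reinterpret the double's bytes as an unsigned 64-bit integer
    # (the C helper does this with a pointer cast to khint64_t).
    as_int64 = struct.unpack('<Q', struct.pack('<d', key))[0]
    # khash's default 64-bit mix, truncated to the low 32 bits.
    return ((as_int64 >> 33) ^ as_int64 ^ (as_int64 << 11)) & 0xFFFFFFFF

# Equal doubles always produce equal bytes, hence equal hashes.
assert float64_hash(1.5) == float64_hash(3.0 / 2.0)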
+// GH 13436 +khint64_t PANDAS_INLINE asint64(double key) { + return *(khint64_t *)(&key); +} +#define kh_float64_hash_func(key) (khint32_t)((asint64(key))>>33^(asint64(key))^(asint64(key))<<11) #define kh_float64_hash_equal(a, b) ((a) == (b) || ((b) != (b) && (a) != (a))) #define KHASH_MAP_INIT_FLOAT64(name, khval_t) \ diff --git a/pandas/src/offsets.pyx b/pandas/src/offsets.pyx index 096198c8a05fa..c963e256d0aa5 100644 --- a/pandas/src/offsets.pyx +++ b/pandas/src/offsets.pyx @@ -162,7 +162,7 @@ cdef class YearOffset(_Offset): cpdef prev(self): cdef int64_t days - days = 365 + is_leapyear(self.y - (1-self.ly)) + days = 365 + is_leapyear(self.y - (1 - self.ly)) self.t -= days * us_in_day self.y -= 1 @@ -204,8 +204,8 @@ cdef class MonthOffset(_Offset): self.t = ts.value + (self.dayoffset * us_in_day) # for day counting - self.m = ts.dts.month - 1 - self.y = ts.dts.year + self.m = ts.dts.month - 1 + self.y = ts.dts.year self.ly = is_leapyear(self.y) if self.biz != 0: diff --git a/pandas/src/parse_helper.h b/pandas/src/parse_helper.h index fd5089dd8963d..e565f02f27c88 100644 --- a/pandas/src/parse_helper.h +++ b/pandas/src/parse_helper.h @@ -66,7 +66,7 @@ int floatify(PyObject* str, double *result, int *maybe_int) { return 0; parsingerror: - PyErr_SetString(PyExc_ValueError, "Unable to parse string"); + PyErr_Format(PyExc_ValueError, "Unable to parse string \"%s\"", data); Py_XDECREF(tmp); return -1; diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index 6091c79e2b4fc..af85b7b894d26 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -684,14 +684,19 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) { #define IS_WHITESPACE(c) ((c == ' ' || c == '\t')) -#define IS_TERMINATOR(c) ((self->lineterminator == '\0' && \ - c == '\n') || c == self->lineterminator) +#define IS_TERMINATOR(c) ((self->lineterminator == '\0' && c == '\n') || \ + (self->lineterminator != '\0' && \ + c == self->lineterminator)) #define IS_QUOTE(c) ((c == self->quotechar && self->quoting != QUOTE_NONE)) // don't parse '\r' with a custom line terminator #define IS_CARRIAGE(c) ((self->lineterminator == '\0' && c == '\r')) +#define IS_COMMENT_CHAR(c) ((self->commentchar != '\0' && c == self->commentchar)) + +#define IS_ESCAPE_CHAR(c) ((self->escapechar != '\0' && c == self->escapechar)) + #define IS_SKIPPABLE_SPACE(c) ((!self->delim_whitespace && c == ' ' && \ self->skipinitialspace)) @@ -704,6 +709,11 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) { self->datapos = i; \ TRACE(("_TOKEN_CLEANUP: datapos: %d, datalen: %d\n", self->datapos, self->datalen)); +#define CHECK_FOR_BOM() \ + if (*buf == '\xef' && *(buf + 1) == '\xbb' && *(buf + 2) == '\xbf') { \ + buf += 3; \ + self->datapos += 3; \ + } int skip_this_line(parser_t *self, int64_t rownum) { if (self->skipset != NULL) { @@ -736,6 +746,10 @@ int tokenize_bytes(parser_t *self, size_t line_limit) TRACE(("%s\n", buf)); + if (self->file_lines == 0) { + CHECK_FOR_BOM(); + } + for (i = self->datapos; i < self->datalen; ++i) { // next character in file @@ -857,7 +871,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit) self->state = EAT_CRNL; } break; - } else if (c == self->commentchar) { + } else if (IS_COMMENT_CHAR(c)) { self->state = EAT_LINE_COMMENT; break; } else if (IS_WHITESPACE(c)) { @@ -890,7 +904,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit) } else if (IS_QUOTE(c)) { // start quoted field self->state = IN_QUOTED_FIELD; - } else if (c == self->escapechar) { + } else 
if (IS_ESCAPE_CHAR(c)) { // possible escaped character self->state = ESCAPED_CHAR; } else if (IS_SKIPPABLE_SPACE(c)) { @@ -903,7 +917,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit) // save empty field END_FIELD(); } - } else if (c == self->commentchar) { + } else if (IS_COMMENT_CHAR(c)) { END_FIELD(); self->state = EAT_COMMENT; } else { @@ -941,7 +955,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit) } else if (IS_CARRIAGE(c)) { END_FIELD(); self->state = EAT_CRNL; - } else if (c == self->escapechar) { + } else if (IS_ESCAPE_CHAR(c)) { // possible escaped character self->state = ESCAPED_CHAR; } else if (IS_DELIMITER(c)) { @@ -953,7 +967,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit) } else { self->state = START_FIELD; } - } else if (c == self->commentchar) { + } else if (IS_COMMENT_CHAR(c)) { END_FIELD(); self->state = EAT_COMMENT; } else { @@ -964,7 +978,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit) case IN_QUOTED_FIELD: // in quoted field - if (c == self->escapechar) { + if (IS_ESCAPE_CHAR(c)) { // possible escape character self->state = ESCAPE_IN_QUOTED_FIELD; } else if (IS_QUOTE(c)) { @@ -1221,20 +1235,7 @@ int parser_trim_buffers(parser_t *self) { size_t new_cap; void *newptr; - /* trim stream */ - new_cap = _next_pow2(self->stream_len) + 1; - TRACE(("parser_trim_buffers: new_cap = %zu, stream_cap = %zu, lines_cap = %zu\n", - new_cap, self->stream_cap, self->lines_cap)); - if (new_cap < self->stream_cap) { - TRACE(("parser_trim_buffers: new_cap < self->stream_cap, calling safe_realloc\n")); - newptr = safe_realloc((void*) self->stream, new_cap); - if (newptr == NULL) { - return PARSER_OUT_OF_MEMORY; - } else { - self->stream = newptr; - self->stream_cap = new_cap; - } - } + int i; /* trim words, word_starts */ new_cap = _next_pow2(self->words_len) + 1; @@ -1255,6 +1256,35 @@ int parser_trim_buffers(parser_t *self) { } } + /* trim stream */ + new_cap = _next_pow2(self->stream_len) + 1; + TRACE(("parser_trim_buffers: new_cap = %zu, stream_cap = %zu, lines_cap = %zu\n", + new_cap, self->stream_cap, self->lines_cap)); + if (new_cap < self->stream_cap) { + TRACE(("parser_trim_buffers: new_cap < self->stream_cap, calling safe_realloc\n")); + newptr = safe_realloc((void*) self->stream, new_cap); + if (newptr == NULL) { + return PARSER_OUT_OF_MEMORY; + } else { + // Update the pointers in the self->words array (char **) if `safe_realloc` + // moved the `self->stream` buffer. This block mirrors a similar block in + // `make_stream_space`. 
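(Editorial sketch, not part of the patch.) The rebasing idea in the comment above -- keep offsets, rebuild cached pointers whenever the underlying buffer moves -- can be illustrated with a small NumPy analogue; the names stream, word_starts and words below are hypothetical stand-ins for the parser fields, and NumPy views play the role of the cached char* pointers.

import numpy as np

# Hypothetical analogue of the parser fields: `stream` owns the characters,
# `word_starts` records offsets, and `words` caches views into `stream`.
stream = np.frombuffer(b"alphabeta", dtype=np.uint8).copy()
word_starts = [0, 5]
word_lengths = [5, 4]
words = [stream[s:s + n] for s, n in zip(word_starts, word_lengths)]

# A "realloc" that moves the buffer: allocate new storage and copy. The old
# views still reference the old array, so they are rebuilt from the stored
# offsets against the new base, which is what the block below does with the
# word pointers.
new_stream = np.empty(64, dtype=np.uint8)
new_stream[:stream.size] = stream
stream = new_stream
words = [stream[s:s + n] for s, n in zip(word_starts, word_lengths)]
assert bytes(words[0]) == b"alpha" and bytes(words[1]) == b"beta"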
+ if (self->stream != newptr) { + /* TRACE(("Moving word pointers\n")) */ + self->pword_start = (char*) newptr + self->word_start; + + for (i = 0; i < self->words_len; ++i) + { + self->words[i] = (char*) newptr + self->word_starts[i]; + } + } + + self->stream = newptr; + self->stream_cap = new_cap; + + } + } + /* trim line_start, line_fields */ new_cap = _next_pow2(self->lines) + 1; if (new_cap < self->lines_cap) { diff --git a/pandas/src/period.pyx b/pandas/src/period.pyx index 858aa58df8d7d..5565f25937394 100644 --- a/pandas/src/period.pyx +++ b/pandas/src/period.pyx @@ -24,6 +24,7 @@ cimport cython from datetime cimport * cimport util cimport lib +from lib cimport is_null_datetimelike, is_period import lib from pandas import tslib from tslib import Timedelta, Timestamp, iNaT, NaT @@ -79,17 +80,21 @@ cdef extern from "period_helper.h": ctypedef int64_t (*freq_conv_func)(int64_t, char, asfreq_info*) void initialize_daytime_conversion_factor_matrix() - int64_t asfreq(int64_t dtordinal, int freq1, int freq2, char relation) except INT32_MIN + int64_t asfreq(int64_t dtordinal, int freq1, int freq2, + char relation) except INT32_MIN freq_conv_func get_asfreq_func(int fromFreq, int toFreq) void get_asfreq_info(int fromFreq, int toFreq, asfreq_info *af_info) int64_t get_period_ordinal(int year, int month, int day, - int hour, int minute, int second, int microseconds, int picoseconds, - int freq) nogil except INT32_MIN + int hour, int minute, int second, + int microseconds, int picoseconds, + int freq) nogil except INT32_MIN - int64_t get_python_ordinal(int64_t period_ordinal, int freq) except INT32_MIN + int64_t get_python_ordinal(int64_t period_ordinal, + int freq) except INT32_MIN - int get_date_info(int64_t ordinal, int freq, date_info *dinfo) nogil except INT32_MIN + int get_date_info(int64_t ordinal, int freq, + date_info *dinfo) nogil except INT32_MIN double getAbsTime(int, int64_t, int64_t) int pyear(int64_t ordinal, int freq) except INT32_MIN @@ -133,6 +138,7 @@ cdef inline int64_t remove_mult(int64_t period_ord_w_mult, int64_t mult): return period_ord_w_mult * mult + 1; + @cython.wraparound(False) @cython.boundscheck(False) def dt64arr_to_periodarr(ndarray[int64_t] dtarr, int freq, tz=None): @@ -157,11 +163,13 @@ def dt64arr_to_periodarr(ndarray[int64_t] dtarr, int freq, tz=None): continue pandas_datetime_to_datetimestruct(dtarr[i], PANDAS_FR_ns, &dts) out[i] = get_period_ordinal(dts.year, dts.month, dts.day, - dts.hour, dts.min, dts.sec, dts.us, dts.ps, freq) + dts.hour, dts.min, dts.sec, + dts.us, dts.ps, freq) else: out = localize_dt64arr_to_period(dtarr, freq, tz) return out + @cython.wraparound(False) @cython.boundscheck(False) def periodarr_to_dt64arr(ndarray[int64_t] periodarr, int freq): @@ -211,6 +219,7 @@ cpdef int64_t period_asfreq(int64_t period_ordinal, int freq1, int freq2, return retval + def period_asfreq_arr(ndarray[int64_t] arr, int freq1, int freq2, bint end): """ Convert int64-array of period ordinals from one frequency to another, and @@ -253,7 +262,9 @@ def period_asfreq_arr(ndarray[int64_t] arr, int freq1, int freq2, bint end): return result -def period_ordinal(int y, int m, int d, int h, int min, int s, int us, int ps, int freq): + +def period_ordinal(int y, int m, int d, int h, int min, + int s, int us, int ps, int freq): cdef: int64_t ordinal @@ -283,6 +294,7 @@ cpdef int64_t period_ordinal_to_dt64(int64_t ordinal, int freq) nogil: return pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts) + def period_format(int64_t value, int freq, object fmt=None): cdef: 
int freq_group @@ -331,7 +343,8 @@ cdef list extra_fmts = [(b"%q", b"^`AB`^"), (b"%u", b"^`IJ`^"), (b"%n", b"^`KL`^")] -cdef list str_extra_fmts = ["^`AB`^", "^`CD`^", "^`EF`^", "^`GH`^", "^`IJ`^", "^`KL`^"] +cdef list str_extra_fmts = ["^`AB`^", "^`CD`^", "^`EF`^", + "^`GH`^", "^`IJ`^", "^`KL`^"] cdef object _period_strftime(int64_t value, int freq, object fmt): import sys @@ -389,6 +402,7 @@ cdef object _period_strftime(int64_t value, int freq, object fmt): ctypedef int (*accessor)(int64_t ordinal, int freq) except INT32_MIN + def get_period_field(int code, int64_t value, int freq): cdef accessor f = _get_accessor_func(code) if f is NULL: @@ -397,6 +411,7 @@ def get_period_field(int code, int64_t value, int freq): return np.nan return f(value, freq) + def get_period_field_arr(int code, ndarray[int64_t] arr, int freq): cdef: Py_ssize_t i, sz @@ -419,7 +434,6 @@ def get_period_field_arr(int code, ndarray[int64_t] arr, int freq): return out - cdef accessor _get_accessor_func(int code): if code == 0: return &pyear @@ -458,13 +472,46 @@ def extract_ordinals(ndarray[object] values, freq): for i in range(n): p = values[i] - ordinals[i] = p.ordinal - if p.freqstr != freqstr: - msg = _DIFFERENT_FREQ_INDEX.format(freqstr, p.freqstr) - raise IncompatibleFrequency(msg) + + if is_null_datetimelike(p): + ordinals[i] = tslib.iNaT + else: + try: + ordinals[i] = p.ordinal + + if p.freqstr != freqstr: + msg = _DIFFERENT_FREQ_INDEX.format(freqstr, p.freqstr) + raise IncompatibleFrequency(msg) + + except AttributeError: + p = Period(p, freq=freq) + if p is tslib.NaT: + # input may contain NaT-like string + ordinals[i] = tslib.iNaT + else: + ordinals[i] = p.ordinal return ordinals + +def extract_freq(ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + object p + + for i in range(n): + p = values[i] + + try: + # now Timestamp / NaT has freq attr + if is_period(p): + return p.freq + except AttributeError: + pass + + raise ValueError('freq not specified and cannot be inferred') + + cpdef resolution(ndarray[int64_t] stamps, tz=None): cdef: Py_ssize_t i, n = len(stamps) @@ -537,7 +584,7 @@ cdef _reso_local(ndarray[int64_t] stamps, object tz): pos = _pos # statictzinfo - if typ not in ['pytz','dateutil']: + if typ not in ['pytz', 'dateutil']: for i in range(n): if stamps[i] == NPY_NAT: continue @@ -579,7 +626,8 @@ cdef ndarray[int64_t] localize_dt64arr_to_period(ndarray[int64_t] stamps, continue pandas_datetime_to_datetimestruct(stamps[i], PANDAS_FR_ns, &dts) result[i] = get_period_ordinal(dts.year, dts.month, dts.day, - dts.hour, dts.min, dts.sec, dts.us, dts.ps, freq) + dts.hour, dts.min, dts.sec, + dts.us, dts.ps, freq) elif _is_tzlocal(tz): for i in range(n): @@ -594,7 +642,8 @@ cdef ndarray[int64_t] localize_dt64arr_to_period(ndarray[int64_t] stamps, pandas_datetime_to_datetimestruct(stamps[i] + delta, PANDAS_FR_ns, &dts) result[i] = get_period_ordinal(dts.year, dts.month, dts.day, - dts.hour, dts.min, dts.sec, dts.us, dts.ps, freq) + dts.hour, dts.min, dts.sec, + dts.us, dts.ps, freq) else: # Adjust datetime64 timestamp, recompute datetimestruct trans, deltas, typ = _get_dst_info(tz) @@ -605,7 +654,7 @@ cdef ndarray[int64_t] localize_dt64arr_to_period(ndarray[int64_t] stamps, pos = _pos # statictzinfo - if typ not in ['pytz','dateutil']: + if typ not in ['pytz', 'dateutil']: for i in range(n): if stamps[i] == NPY_NAT: result[i] = NPY_NAT @@ -613,7 +662,8 @@ cdef ndarray[int64_t] localize_dt64arr_to_period(ndarray[int64_t] stamps, pandas_datetime_to_datetimestruct(stamps[i] + deltas[0], 
PANDAS_FR_ns, &dts) result[i] = get_period_ordinal(dts.year, dts.month, dts.day, - dts.hour, dts.min, dts.sec, dts.us, dts.ps, freq) + dts.hour, dts.min, dts.sec, + dts.us, dts.ps, freq) else: for i in range(n): if stamps[i] == NPY_NAT: @@ -622,51 +672,34 @@ cdef ndarray[int64_t] localize_dt64arr_to_period(ndarray[int64_t] stamps, pandas_datetime_to_datetimestruct(stamps[i] + deltas[pos[i]], PANDAS_FR_ns, &dts) result[i] = get_period_ordinal(dts.year, dts.month, dts.day, - dts.hour, dts.min, dts.sec, dts.us, dts.ps, freq) + dts.hour, dts.min, dts.sec, + dts.us, dts.ps, freq) return result _DIFFERENT_FREQ = "Input has different freq={1} from Period(freq={0})" -_DIFFERENT_FREQ_INDEX = "Input has different freq={1} from PeriodIndex(freq={0})" +_DIFFERENT_FREQ_INDEX = ("Input has different freq={1} " + "from PeriodIndex(freq={0})") class IncompatibleFrequency(ValueError): pass -cdef class Period(object): - """ - Represents an period of time +cdef class _Period(object): - Parameters - ---------- - value : Period or compat.string_types, default None - The time period represented (e.g., '4Q2005') - freq : str, default None - One of pandas period strings or corresponding objects - year : int, default None - month : int, default 1 - quarter : int, default None - day : int, default 1 - hour : int, default 0 - minute : int, default 0 - second : int, default 0 - """ cdef public: int64_t ordinal object freq - _comparables = ['name','freqstr'] + _comparables = ['name', 'freqstr'] _typ = 'period' @classmethod def _maybe_convert_freq(cls, object freq): - if isinstance(freq, compat.string_types): - freq = freq.upper() - freq = frequencies._period_alias_dict.get(freq, freq) - elif isinstance(freq, (int, tuple)): + if isinstance(freq, (int, tuple)): code, stride = frequencies.get_freq_code(freq) freq = frequencies._get_freq_str(code, stride) @@ -680,98 +713,25 @@ cdef class Period(object): @classmethod def _from_ordinal(cls, ordinal, freq): - """ fast creation from an ordinal and freq that are already validated! """ - self = Period.__new__(cls) - self.ordinal = ordinal - self.freq = cls._maybe_convert_freq(freq) - return self - - def __init__(self, value=None, freq=None, ordinal=None, - year=None, month=1, quarter=None, day=1, - hour=0, minute=0, second=0): - # freq points to a tuple (base, mult); base is one of the defined - # periods such as A, Q, etc. 
Every five minutes would be, e.g., - # ('T', 5) but may be passed in as a string like '5T' - - # ordinal is the period offset from the gregorian proleptic epoch - - if ordinal is not None and value is not None: - raise ValueError(("Only value or ordinal but not both should be " - "given but not both")) - elif ordinal is not None: - if not lib.is_integer(ordinal): - raise ValueError("Ordinal must be an integer") - if freq is None: - raise ValueError('Must supply freq for ordinal value') - - elif value is None: - if freq is None: - raise ValueError("If value is None, freq cannot be None") - ordinal = _ordinal_from_fields(year, month, quarter, day, - hour, minute, second, freq) - - elif isinstance(value, Period): - other = value - if freq is None or frequencies.get_freq_code(freq) == frequencies.get_freq_code(other.freq): - ordinal = other.ordinal - freq = other.freq - else: - converted = other.asfreq(freq) - ordinal = converted.ordinal - - elif lib.is_null_datetimelike(value) or value in tslib._nat_strings: - ordinal = tslib.iNaT - if freq is None: - raise ValueError("If value is NaT, freq cannot be None " - "because it cannot be inferred") - - elif isinstance(value, compat.string_types) or lib.is_integer(value): - if lib.is_integer(value): - value = str(value) - value = value.upper() - dt, _, reso = parse_time_string(value, freq) - - if freq is None: - try: - freq = frequencies.Resolution.get_freq(reso) - except KeyError: - raise ValueError("Invalid frequency or could not infer: %s" % reso) - - elif isinstance(value, datetime): - dt = value - if freq is None: - raise ValueError('Must supply freq for datetime value') - elif isinstance(value, np.datetime64): - dt = Timestamp(value) - if freq is None: - raise ValueError('Must supply freq for datetime value') - elif isinstance(value, date): - dt = datetime(year=value.year, month=value.month, day=value.day) - if freq is None: - raise ValueError('Must supply freq for datetime value') - else: - msg = "Value must be Period, string, integer, or datetime" - raise ValueError(msg) - - base, mult = frequencies.get_freq_code(freq) - - if ordinal is None: - self.ordinal = get_period_ordinal(dt.year, dt.month, dt.day, - dt.hour, dt.minute, dt.second, - dt.microsecond, 0, base) + """ + Fast creation from an ordinal and freq that are already validated! 
+ """ + if ordinal == tslib.iNaT: + return tslib.NaT else: + self = _Period.__new__(cls) self.ordinal = ordinal - - self.freq = self._maybe_convert_freq(freq) + self.freq = cls._maybe_convert_freq(freq) + return self def __richcmp__(self, other, op): if isinstance(other, Period): if other.freq != self.freq: msg = _DIFFERENT_FREQ.format(self.freqstr, other.freqstr) raise IncompatibleFrequency(msg) - if self.ordinal == tslib.iNaT or other.ordinal == tslib.iNaT: - return _nat_scalar_rules[op] return PyObject_RichCompareBool(self.ordinal, other.ordinal, op) + elif other is tslib.NaT: + return _nat_scalar_rules[op] # index/series like elif hasattr(other, '_typ'): return NotImplemented @@ -784,31 +744,26 @@ cdef class Period(object): (type(self).__name__, type(other).__name__)) def __hash__(self): - return hash((self.ordinal, self.freq)) + return hash((self.ordinal, self.freqstr)) def _add_delta(self, other): - if isinstance(other, (timedelta, np.timedelta64, offsets.Tick, Timedelta)): + if isinstance(other, (timedelta, np.timedelta64, + offsets.Tick, Timedelta)): offset = frequencies.to_offset(self.freq.rule_code) if isinstance(offset, offsets.Tick): nanos = tslib._delta_to_nanoseconds(other) offset_nanos = tslib._delta_to_nanoseconds(offset) if nanos % offset_nanos == 0: - if self.ordinal == tslib.iNaT: - ordinal = self.ordinal - else: - ordinal = self.ordinal + (nanos // offset_nanos) + ordinal = self.ordinal + (nanos // offset_nanos) return Period(ordinal=ordinal, freq=self.freq) msg = 'Input cannot be converted to Period(freq={0})' raise IncompatibleFrequency(msg.format(self.freqstr)) elif isinstance(other, offsets.DateOffset): - freqstr = frequencies.get_standard_freq(other) + freqstr = other.rule_code base = frequencies.get_base_alias(freqstr) if base == self.freq.rule_code: - if self.ordinal == tslib.iNaT: - ordinal = self.ordinal - else: - ordinal = self.ordinal + other.n + ordinal = self.ordinal + other.n return Period(ordinal=ordinal, freq=self.freq) msg = _DIFFERENT_FREQ.format(self.freqstr, other.freqstr) raise IncompatibleFrequency(msg) @@ -818,15 +773,13 @@ cdef class Period(object): def __add__(self, other): if isinstance(self, Period): if isinstance(other, (timedelta, np.timedelta64, - offsets.Tick, offsets.DateOffset, Timedelta)): + offsets.Tick, offsets.DateOffset, + Timedelta)): return self._add_delta(other) elif other is tslib.NaT: return tslib.NaT elif lib.is_integer(other): - if self.ordinal == tslib.iNaT: - ordinal = self.ordinal - else: - ordinal = self.ordinal + other * self.freq.n + ordinal = self.ordinal + other * self.freq.n return Period(ordinal=ordinal, freq=self.freq) else: # pragma: no cover return NotImplemented @@ -838,21 +791,17 @@ cdef class Period(object): def __sub__(self, other): if isinstance(self, Period): if isinstance(other, (timedelta, np.timedelta64, - offsets.Tick, offsets.DateOffset, Timedelta)): + offsets.Tick, offsets.DateOffset, + Timedelta)): neg_other = -other return self + neg_other elif lib.is_integer(other): - if self.ordinal == tslib.iNaT: - ordinal = self.ordinal - else: - ordinal = self.ordinal - other * self.freq.n + ordinal = self.ordinal - other * self.freq.n return Period(ordinal=ordinal, freq=self.freq) elif isinstance(other, Period): if other.freq != self.freq: msg = _DIFFERENT_FREQ.format(self.freqstr, other.freqstr) raise IncompatibleFrequency(msg) - if self.ordinal == tslib.iNaT or other.ordinal == tslib.iNaT: - return Period(ordinal=tslib.iNaT, freq=self.freq) return self.ordinal - other.ordinal elif getattr(other, '_typ', 
None) == 'periodindex': return -other.__sub__(self) @@ -880,20 +829,18 @@ cdef class Period(object): ------- resampled : Period """ + freq = self._maybe_convert_freq(freq) how = _validate_end_alias(how) base1, mult1 = frequencies.get_freq_code(self.freq) base2, mult2 = frequencies.get_freq_code(freq) - if self.ordinal == tslib.iNaT: - ordinal = self.ordinal + # mult1 can't be negative or 0 + end = how == 'E' + if end: + ordinal = self.ordinal + mult1 - 1 else: - # mult1 can't be negative or 0 - end = how == 'E' - if end: - ordinal = self.ordinal + mult1 - 1 - else: - ordinal = self.ordinal - ordinal = period_asfreq(ordinal, base1, base2, end) + ordinal = self.ordinal + ordinal = period_asfreq(ordinal, base1, base2, end) return Period(ordinal=ordinal, freq=freq) @@ -903,12 +850,9 @@ cdef class Period(object): @property def end_time(self): - if self.ordinal == tslib.iNaT: - ordinal = self.ordinal - else: - # freq.n can't be negative or 0 - # ordinal = (self + self.freq.n).start_time.value - 1 - ordinal = (self + 1).start_time.value - 1 + # freq.n can't be negative or 0 + # ordinal = (self + self.freq.n).start_time.value - 1 + ordinal = (self + 1).start_time.value - 1 return Timestamp(ordinal) def to_timestamp(self, freq=None, how='start', tz=None): @@ -929,6 +873,8 @@ cdef class Period(object): ------- Timestamp """ + if freq is not None: + freq = self._maybe_convert_freq(freq) how = _validate_end_alias(how) if freq is None: @@ -990,6 +936,9 @@ cdef class Period(object): property daysinmonth: def __get__(self): return self.days_in_month + property is_leap_year: + def __get__(self): + return bool(is_leapyear(self._field(0))) @classmethod def now(cls, freq=None): @@ -1169,13 +1118,126 @@ cdef class Period(object): return period_format(self.ordinal, base, fmt) -def _ordinal_from_fields(year, month, quarter, day, hour, minute, - second, freq): +class Period(_Period): + """ + Represents an period of time + + Parameters + ---------- + value : Period or compat.string_types, default None + The time period represented (e.g., '4Q2005') + freq : str, default None + One of pandas period strings or corresponding objects + year : int, default None + month : int, default 1 + quarter : int, default None + day : int, default 1 + hour : int, default 0 + minute : int, default 0 + second : int, default 0 + """ + + def __new__(cls, value=None, freq=None, ordinal=None, + year=None, month=None, quarter=None, day=None, + hour=None, minute=None, second=None): + # freq points to a tuple (base, mult); base is one of the defined + # periods such as A, Q, etc. 
Every five minutes would be, e.g., + # ('T', 5) but may be passed in as a string like '5T' + + # ordinal is the period offset from the gregorian proleptic epoch + + cdef _Period self + + if freq is not None: + freq = cls._maybe_convert_freq(freq) + + if ordinal is not None and value is not None: + raise ValueError(("Only value or ordinal but not both should be " + "given but not both")) + elif ordinal is not None: + if not lib.is_integer(ordinal): + raise ValueError("Ordinal must be an integer") + if freq is None: + raise ValueError('Must supply freq for ordinal value') + + elif value is None: + if (year is None and month is None and + quarter is None and day is None and + hour is None and minute is None and second is None): + ordinal = tslib.iNaT + else: + if freq is None: + raise ValueError("If value is None, freq cannot be None") + + # set defaults + month = 1 if month is None else month + day = 1 if day is None else day + hour = 0 if hour is None else hour + minute = 0 if minute is None else minute + second = 0 if second is None else second + + ordinal = _ordinal_from_fields(year, month, quarter, day, + hour, minute, second, freq) + + elif isinstance(value, Period): + other = value + if freq is None or frequencies.get_freq_code( + freq) == frequencies.get_freq_code(other.freq): + ordinal = other.ordinal + freq = other.freq + else: + converted = other.asfreq(freq) + ordinal = converted.ordinal + + elif is_null_datetimelike(value) or value in tslib._nat_strings: + ordinal = tslib.iNaT + + elif isinstance(value, compat.string_types) or lib.is_integer(value): + if lib.is_integer(value): + value = str(value) + value = value.upper() + dt, _, reso = parse_time_string(value, freq) + + if freq is None: + try: + freq = frequencies.Resolution.get_freq(reso) + except KeyError: + raise ValueError( + "Invalid frequency or could not infer: %s" % reso) + + elif isinstance(value, datetime): + dt = value + if freq is None: + raise ValueError('Must supply freq for datetime value') + elif isinstance(value, np.datetime64): + dt = Timestamp(value) + if freq is None: + raise ValueError('Must supply freq for datetime value') + elif isinstance(value, date): + dt = datetime(year=value.year, month=value.month, day=value.day) + if freq is None: + raise ValueError('Must supply freq for datetime value') + else: + msg = "Value must be Period, string, integer, or datetime" + raise ValueError(msg) + + if ordinal is None: + base, mult = frequencies.get_freq_code(freq) + ordinal = get_period_ordinal(dt.year, dt.month, dt.day, + dt.hour, dt.minute, dt.second, + dt.microsecond, 0, base) + + return cls._from_ordinal(ordinal, freq) + + +def _ordinal_from_fields(year, month, quarter, day, + hour, minute, second, freq): base, mult = frequencies.get_freq_code(freq) if quarter is not None: year, month = _quarter_to_myear(year, quarter, freq) - return get_period_ordinal(year, month, day, hour, minute, second, 0, 0, base) + return get_period_ordinal(year, month, day, hour, + minute, second, 0, 0, base) def _quarter_to_myear(year, quarter, freq): @@ -1183,7 +1245,8 @@ def _quarter_to_myear(year, quarter, freq): if quarter <= 0 or quarter > 4: raise ValueError('Quarter must be 1 <= q <= 4') - mnum = frequencies._month_numbers[frequencies._get_rule_month(freq)] + 1 + mnum = frequencies._month_numbers[ + frequencies._get_rule_month(freq)] + 1 month = (mnum + (quarter - 1) * 3) % 12 + 1 if month > mnum: year -= 1 diff --git a/pandas/src/reduce.pyx b/pandas/src/reduce.pyx index c3f8bdfbfd0a6..1cd3e53494a72 100644 --- 
a/pandas/src/reduce.pyx +++ b/pandas/src/reduce.pyx @@ -46,11 +46,11 @@ cdef class Reducer: self.chunksize = k self.increment = k * arr.dtype.itemsize - self.f = f self.arr = arr self.labels = labels - self.dummy, self.typ, self.index, self.ityp = self._check_dummy(dummy=dummy) + self.dummy, self.typ, self.index, self.ityp = self._check_dummy( + dummy=dummy) def _check_dummy(self, dummy=None): cdef object index=None, typ=None, ityp=None @@ -65,16 +65,17 @@ cdef class Reducer: else: # we passed a series-like - if hasattr(dummy,'values'): + if hasattr(dummy, 'values'): typ = type(dummy) - index = getattr(dummy,'index',None) + index = getattr(dummy, 'index', None) dummy = dummy.values if dummy.dtype != self.arr.dtype: raise ValueError('Dummy array must be same dtype') if len(dummy) != self.chunksize: - raise ValueError('Dummy array must be length %d' % self.chunksize) + raise ValueError('Dummy array must be length %d' % + self.chunksize) return dummy, typ, index, ityp @@ -111,15 +112,16 @@ cdef class Reducer: if self.typ is not None: - # recreate with the index if supplied - if has_index: + # recreate with the index if supplied + if has_index: - cached_typ = self.typ(chunk, index=self.index, name=name) + cached_typ = self.typ( + chunk, index=self.index, name=name) - else: + else: - # use the passsed typ, sans index - cached_typ = self.typ(chunk, name=name) + # use the passsed typ, sans index + cached_typ = self.typ(chunk, name=name) # use the cached_typ if possible if cached_typ is not None: @@ -127,13 +129,15 @@ cdef class Reducer: if has_index: object.__setattr__(cached_typ, 'index', self.index) - object.__setattr__(cached_typ._data._block, 'values', chunk) + object.__setattr__( + cached_typ._data._block, 'values', chunk) object.__setattr__(cached_typ, 'name', name) res = self.f(cached_typ) else: res = self.f(chunk) - if hasattr(res,'values') and isinstance(res.values, np.ndarray): + if hasattr(res, 'values') and isinstance( + res.values, np.ndarray): res = res.values if i == 0: result = _get_result_array(res, @@ -167,7 +171,8 @@ cdef class SeriesBinGrouper: bint passed_dummy cdef public: - object arr, index, dummy_arr, dummy_index, values, f, bins, typ, ityp, name + object arr, index, dummy_arr, dummy_index + object values, f, bins, typ, ityp, name def __init__(self, object series, object f, object bins, object dummy): n = len(series) @@ -182,7 +187,7 @@ cdef class SeriesBinGrouper: self.typ = series._constructor self.ityp = series.index._constructor self.index = series.index.values - self.name = getattr(series,'name',None) + self.name = getattr(series, 'name', None) self.dummy_arr, self.dummy_index = self._check_dummy(dummy) self.passed_dummy = dummy is not None @@ -205,7 +210,7 @@ cdef class SeriesBinGrouper: raise ValueError('Dummy array must be same dtype') if not values.flags.contiguous: values = values.copy() - index = dummy.index.values + index = dummy.index.values if not index.flags.contiguous: index = index.copy() @@ -227,9 +232,9 @@ cdef class SeriesBinGrouper: counts[0] = self.bins[0] for i in range(1, self.ngroups): if i == self.ngroups - 1: - counts[i] = len(self.arr) - self.bins[i-1] + counts[i] = len(self.arr) - self.bins[i - 1] else: - counts[i] = self.bins[i] - self.bins[i-1] + counts[i] = self.bins[i] - self.bins[i - 1] group_size = 0 n = len(self.arr) @@ -252,7 +257,8 @@ cdef class SeriesBinGrouper: else: object.__setattr__(cached_ityp, '_data', islider.buf) cached_ityp._engine.clear_mapping() - object.__setattr__(cached_typ._data._block, 'values', vslider.buf) + 
object.__setattr__( + cached_typ._data._block, 'values', vslider.buf) object.__setattr__(cached_typ, '_index', cached_ityp) object.__setattr__(cached_typ, 'name', name) @@ -293,7 +299,8 @@ cdef class SeriesGrouper: bint passed_dummy cdef public: - object arr, index, dummy_arr, dummy_index, f, labels, values, typ, ityp, name + object arr, index, dummy_arr, dummy_index + object f, labels, values, typ, ityp, name def __init__(self, object series, object f, object labels, Py_ssize_t ngroups, object dummy): @@ -309,7 +316,7 @@ cdef class SeriesGrouper: self.typ = series._constructor self.ityp = series.index._constructor self.index = series.index.values - self.name = getattr(series,'name',None) + self.name = getattr(series, 'name', None) self.dummy_arr, self.dummy_index = self._check_dummy(dummy) self.passed_dummy = dummy is not None @@ -320,14 +327,14 @@ cdef class SeriesGrouper: if dummy is None: values = np.empty(0, dtype=self.arr.dtype) - index = None + index = None else: values = dummy.values if dummy.dtype != self.arr.dtype: raise ValueError('Dummy array must be same dtype') if not values.flags.contiguous: values = values.copy() - index = dummy.index.values + index = dummy.index.values if not index.flags.contiguous: index = index.copy() @@ -375,7 +382,8 @@ cdef class SeriesGrouper: else: object.__setattr__(cached_ityp, '_data', islider.buf) cached_ityp._engine.clear_mapping() - object.__setattr__(cached_typ._data._block, 'values', vslider.buf) + object.__setattr__( + cached_typ._data._block, 'values', vslider.buf) object.__setattr__(cached_typ, '_index', cached_ityp) object.__setattr__(cached_typ, 'name', name) @@ -411,14 +419,14 @@ cdef class SeriesGrouper: cdef inline _extract_result(object res): """ extract the result object, it might be a 0-dim ndarray or a len-1 0-dim, or a scalar """ - if hasattr(res,'values'): - res = res.values + if hasattr(res, 'values'): + res = res.values if not np.isscalar(res): - if isinstance(res, np.ndarray): - if res.ndim == 0: - res = res.item() - elif res.ndim == 1 and len(res) == 1: - res = res[0] + if isinstance(res, np.ndarray): + if res.ndim == 0: + res = res.item() + elif res.ndim == 1 and len(res) == 1: + res = res[0] return res cdef class Slider: @@ -467,9 +475,11 @@ cdef class Slider: self.buf.data = self.orig_data self.buf.strides[0] = self.orig_stride + class InvalidApply(Exception): pass + def apply_frame_axis0(object frame, object f, object names, ndarray[int64_t] starts, ndarray[int64_t] ends): cdef: @@ -482,7 +492,6 @@ def apply_frame_axis0(object frame, object f, object names, if frame.index._has_complex_internals: raise InvalidApply('Cannot modify frame index internals') - results = [] # Need to infer if our low-level mucking is going to cause a segfault @@ -496,7 +505,6 @@ def apply_frame_axis0(object frame, object f, object names, except: raise InvalidApply('Let this error raise above us') - slider = BlockSlider(frame) mutated = False @@ -550,7 +558,8 @@ cdef class BlockSlider: util.set_array_not_contiguous(x) self.nblocks = len(self.blocks) - self.idx_slider = Slider(self.frame.index.values, self.dummy.index.values) + self.idx_slider = Slider( + self.frame.index.values, self.dummy.index.values) self.base_ptrs = malloc(sizeof(char*) * len(self.blocks)) for i, block in enumerate(self.blocks): @@ -574,7 +583,7 @@ cdef class BlockSlider: # move and set the index self.idx_slider.move(start, end) - object.__setattr__(self.index,'_data',self.idx_slider.buf) + object.__setattr__(self.index, '_data', self.idx_slider.buf) 
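# Conceptual sketch only (not the Cython Slider/BlockSlider above, which move
# raw buffer pointers): the idea is to expose successive windows of one array
# as views so no per-chunk copy is made while applying f. At the Python level
# the same idea is ordinary numpy slicing, since slices are views:
import numpy as np

arr = np.arange(10, dtype=np.float64)
bins = [0, 4, 7, 10]                        # hypothetical group boundaries
for start, end in zip(bins[:-1], bins[1:]):
    window = arr[start:end]                 # a view sharing memory with arr
    assert window.base is arr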
self.index._engine.clear_mapping() cdef reset(self): @@ -589,6 +598,7 @@ cdef class BlockSlider: arr.data = self.base_ptrs[i] arr.shape[1] = 0 + def reduce(arr, f, axis=0, dummy=None, labels=None): """ @@ -606,7 +616,7 @@ def reduce(arr, f, axis=0, dummy=None, labels=None): raise Exception('Cannot use shortcut') # pass as an ndarray - if hasattr(labels,'values'): + if hasattr(labels, 'values'): labels = labels.values reducer = Reducer(arr, f, axis=axis, dummy=dummy, labels=labels) diff --git a/pandas/src/skiplist.pyx b/pandas/src/skiplist.pyx index e7db7bd5a4a02..3017931e25115 100644 --- a/pandas/src/skiplist.pyx +++ b/pandas/src/skiplist.pyx @@ -75,7 +75,6 @@ cdef class IndexableSkiplist: i -= node.width[level] node = node.next[level] - return node.value cpdef insert(self, double value): diff --git a/pandas/src/sparse.pyx b/pandas/src/sparse.pyx index 94ae26e00f087..7ab29414499fc 100644 --- a/pandas/src/sparse.pyx +++ b/pandas/src/sparse.pyx @@ -1,4 +1,5 @@ -from numpy cimport ndarray, uint8_t, int32_t, float64_t +from numpy cimport (ndarray, uint8_t, int64_t, int32_t, int16_t, int8_t, + float64_t, float32_t, float16_t) cimport numpy as np cimport cython @@ -19,7 +20,7 @@ _np_version_under1p11 = LooseVersion(_np_version) < '1.11' np.import_array() np.import_ufunc() -#------------------------------------------------------------------------------- +# ----------------------------------------------------------------------------- # Preamble stuff cdef float64_t NaN = np.NaN @@ -28,7 +29,7 @@ cdef float64_t INF = np.inf cdef inline int int_max(int a, int b): return a if a >= b else b cdef inline int int_min(int a, int b): return a if a <= b else b -#------------------------------------------------------------------------------- +# ----------------------------------------------------------------------------- cdef class SparseIndex: @@ -111,7 +112,8 @@ cdef class IntIndex(SparseIndex): xindices = self.indices yindices = y.indices - new_indices = np.empty(min(len(xindices), len(yindices)), dtype=np.int32) + new_indices = np.empty(min( + len(xindices), len(yindices)), dtype=np.int32) for xi from 0 <= xi < self.npoints: xind = xindices[xi] @@ -146,13 +148,13 @@ cdef class IntIndex(SparseIndex): return IntIndex(self.length, new_indices) @cython.wraparound(False) - cpdef int lookup(self, Py_ssize_t index): + cpdef int32_t lookup(self, Py_ssize_t index): """ Return the internal location if value exists on given index. Return -1 otherwise. 
""" cdef: - Py_ssize_t res + int32_t res ndarray[int32_t, ndim=1] inds inds = self.indices @@ -170,7 +172,8 @@ cdef class IntIndex(SparseIndex): return -1 @cython.wraparound(False) - cpdef ndarray[int32_t] lookup_array(self, ndarray[int32_t, ndim=1] indexer): + cpdef ndarray[int32_t] lookup_array(self, ndarray[ + int32_t, ndim=1] indexer): """ Vectorized lookup, returns ndarray[int32_t] """ @@ -278,7 +281,7 @@ cpdef get_blocks(ndarray[int32_t, ndim=1] indices): lens = lens[:result_indexer] return locs, lens -#------------------------------------------------------------------------------- +# ----------------------------------------------------------------------------- # BlockIndex cdef class BlockIndex(SparseIndex): @@ -289,7 +292,7 @@ cdef class BlockIndex(SparseIndex): ---------- """ cdef readonly: - Py_ssize_t nblocks, npoints, length + int32_t nblocks, npoints, length ndarray blocs, blengths cdef: @@ -307,7 +310,7 @@ cdef class BlockIndex(SparseIndex): self.lenbuf = self.blengths.data self.length = length - self.nblocks = len(self.blocs) + self.nblocks = np.int32(len(self.blocs)) self.npoints = self.blengths.sum() # self.block_start = blocs @@ -349,7 +352,7 @@ cdef class BlockIndex(SparseIndex): for i from 0 <= i < self.nblocks: if i > 0: - if blocs[i] <= blocs[i-1]: + if blocs[i] <= blocs[i - 1]: raise ValueError('Locations not in ascending order') if i < self.nblocks - 1: @@ -380,7 +383,7 @@ cdef class BlockIndex(SparseIndex): def to_int_index(self): cdef: - Py_ssize_t i = 0, j, b + int32_t i = 0, j, b int32_t offset ndarray[int32_t, ndim=1] indices @@ -497,7 +500,7 @@ cdef class BlockIndex(SparseIndex): """ return BlockUnion(self, y.to_block_index()).result - cpdef int lookup(self, Py_ssize_t index): + cpdef Py_ssize_t lookup(self, Py_ssize_t index): """ Return the internal location if value exists on given index. Return -1 otherwise. 
@@ -523,7 +526,8 @@ cdef class BlockIndex(SparseIndex): return -1 @cython.wraparound(False) - cpdef ndarray[int32_t] lookup_array(self, ndarray[int32_t, ndim=1] indexer): + cpdef ndarray[int32_t] lookup_array(self, ndarray[ + int32_t, ndim=1] indexer): """ Vectorized lookup, returns ndarray[int32_t] """ @@ -641,7 +645,8 @@ cdef class BlockUnion(BlockMerge): cdef _make_merged_blocks(self): cdef: - ndarray[int32_t, ndim=1] xstart, xend, ystart, yend, out_bloc, out_blen + ndarray[int32_t, ndim=1] xstart, xend, ystart + ndarray[int32_t, ndim=1] yend, out_bloc, out_blen int32_t nstart, nend, diff Py_ssize_t max_len, result_indexer = 0 @@ -751,350 +756,13 @@ cdef class BlockUnion(BlockMerge): return self._find_next_block_end(1 - mode) -#------------------------------------------------------------------------------- +# ----------------------------------------------------------------------------- # Sparse arithmetic -ctypedef float64_t (* double_func)(float64_t a, float64_t b) +include "sparse_op_helper.pxi" -cdef inline tuple sparse_combine(ndarray x, SparseIndex xindex, float64_t xfill, - ndarray y, SparseIndex yindex, float64_t yfill, - double_func op): - if isinstance(xindex, BlockIndex): - return block_op(x, xindex.to_block_index(), xfill, - y, yindex.to_block_index(), yfill, op) - elif isinstance(xindex, IntIndex): - return int_op(x, xindex.to_int_index(), xfill, - y, yindex.to_int_index(), yfill, op) - - -@cython.boundscheck(False) -cdef inline tuple block_op(ndarray x_, BlockIndex xindex, float64_t xfill, - ndarray y_, BlockIndex yindex, float64_t yfill, - double_func op): - """ - Binary operator on BlockIndex objects with fill values - """ - - cdef: - BlockIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - Py_ssize_t xbp = 0, ybp = 0 # block positions - int32_t xloc, yloc - Py_ssize_t xblock = 0, yblock = 0 # block numbers - - ndarray[float64_t, ndim=1] x, y - ndarray[float64_t, ndim=1] out - - # to suppress Cython warning - x = x_ - y = y_ - - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.float64) - - # Wow, what a hack job. Need to do something about this - - # walk the two SparseVectors, adding matched locations... 
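# Conceptual model (sketch only) of the lookup methods changed above: both
# IntIndex.lookup and BlockIndex.lookup map a dense position to its slot in
# the stored (sparse) values, or -1 when the position is not stored. For an
# IntIndex this is essentially a binary search over the sorted indices:
import numpy as np

def int_lookup_model(indices, i):
    pos = int(np.searchsorted(indices, i))
    return pos if pos < len(indices) and indices[pos] == i else -1

indices = np.array([0, 2, 5, 9], dtype=np.int32)
assert int_lookup_model(indices, 5) == 2     # third stored point
assert int_lookup_model(indices, 3) == -1    # not stored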
- for out_i from 0 <= out_i < out_index.npoints: - if yblock == yindex.nblocks: - # use y fill value - out[out_i] = op(x[xi], yfill) - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - continue - - if xblock == xindex.nblocks: - # use x fill value - out[out_i] = op(xfill, y[yi]) - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - continue - - yloc = yindex.locbuf[yblock] + ybp - xloc = xindex.locbuf[xblock] + xbp - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = op(x[xi], y[yi]) - xi += 1 - yi += 1 - - # advance both locations - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - elif xloc < yloc: - # use y fill value - out[out_i] = op(x[xi], yfill) - xi += 1 - - # advance x location - xbp += 1 - if xbp == xindex.lenbuf[xblock]: - xblock += 1 - xbp = 0 - else: - # use x fill value - out[out_i] = op(xfill, y[yi]) - yi += 1 - - # advance y location - ybp += 1 - if ybp == yindex.lenbuf[yblock]: - yblock += 1 - ybp = 0 - - return out, out_index - - -@cython.boundscheck(False) -cdef inline tuple int_op(ndarray x_, IntIndex xindex, float64_t xfill, - ndarray y_, IntIndex yindex, float64_t yfill, - double_func op): - cdef: - IntIndex out_index - Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices - int32_t xloc, yloc - ndarray[int32_t, ndim=1] xindices, yindices, out_indices - ndarray[float64_t, ndim=1] x, y - ndarray[float64_t, ndim=1] out - - # suppress Cython compiler warnings due to inlining - x = x_ - y = y_ - - # need to do this first to know size of result array - out_index = xindex.make_union(yindex) - out = np.empty(out_index.npoints, dtype=np.float64) - - xindices = xindex.indices - yindices = yindex.indices - out_indices = out_index.indices - - # walk the two SparseVectors, adding matched locations... 
- for out_i from 0 <= out_i < out_index.npoints: - if xi == xindex.npoints: - # use x fill value - out[out_i] = op(xfill, y[yi]) - yi += 1 - continue - - if yi == yindex.npoints: - # use y fill value - out[out_i] = op(x[xi], yfill) - xi += 1 - continue - - xloc = xindices[xi] - yloc = yindices[yi] - - # each index in the out_index had to come from either x, y, or both - if xloc == yloc: - out[out_i] = op(x[xi], y[yi]) - xi += 1 - yi += 1 - elif xloc < yloc: - # use y fill value - out[out_i] = op(x[xi], yfill) - xi += 1 - else: - # use x fill value - out[out_i] = op(xfill, y[yi]) - yi += 1 - - return out, out_index - -cdef inline float64_t __add(float64_t a, float64_t b): - return a + b - -cdef inline float64_t __sub(float64_t a, float64_t b): - return a - b - -cdef inline float64_t __rsub(float64_t a, float64_t b): - return b - a - -cdef inline float64_t __div(float64_t a, float64_t b): - if b == 0: - if a > 0: - return INF - elif a < 0: - return -INF - else: - return NaN - else: - return a / b - -cdef inline float64_t __rdiv(float64_t a, float64_t b): - return __div(b, a) - -cdef inline float64_t __floordiv(float64_t a, float64_t b): - if b == 0: - # numpy >= 1.11 returns NaN - # for a // 0, rather than +-inf - if _np_version_under1p11: - if a > 0: - return INF - elif a < 0: - return -INF - return NaN - else: - return a // b - -cdef inline float64_t __rfloordiv(float64_t a, float64_t b): - return __floordiv(b, a) - -cdef inline float64_t __mul(float64_t a, float64_t b): - return a * b - -cdef inline float64_t __eq(float64_t a, float64_t b): - return a == b - -cdef inline float64_t __ne(float64_t a, float64_t b): - return a != b - -cdef inline float64_t __lt(float64_t a, float64_t b): - return a < b - -cdef inline float64_t __gt(float64_t a, float64_t b): - return a > b - -cdef inline float64_t __le(float64_t a, float64_t b): - return a <= b - -cdef inline float64_t __ge(float64_t a, float64_t b): - return a >= b - -cdef inline float64_t __mod(float64_t a, float64_t b): - if b == 0: - return NaN - else: - return a % b - -cdef inline float64_t __rmod(float64_t a, float64_t b): - return __mod(b, a) - -cdef inline float64_t __pow(float64_t a, float64_t b): - return a ** b - -cdef inline float64_t __rpow(float64_t a, float64_t b): - return __pow(b, a) - - -# This probably needs to be "templated" to achieve maximum performance. 
-# TODO: quantify performance boost to "templating" - -cpdef sparse_add(ndarray x, SparseIndex xindex, float64_t xfill, - ndarray y, SparseIndex yindex, float64_t yfill): - return sparse_combine(x, xindex, xfill, - y, yindex, yfill, __add) - -cpdef sparse_sub(ndarray x, SparseIndex xindex, float64_t xfill, - ndarray y, SparseIndex yindex, float64_t yfill): - return sparse_combine(x, xindex, xfill, - y, yindex, yfill, __sub) - -cpdef sparse_rsub(ndarray x, SparseIndex xindex, float64_t xfill, - ndarray y, SparseIndex yindex, float64_t yfill): - return sparse_combine(x, xindex, xfill, - y, yindex, yfill, __rsub) - -cpdef sparse_mul(ndarray x, SparseIndex xindex, float64_t xfill, - ndarray y, SparseIndex yindex, float64_t yfill): - return sparse_combine(x, xindex, xfill, - y, yindex, yfill, __mul) - -cpdef sparse_div(ndarray x, SparseIndex xindex, float64_t xfill, - ndarray y, SparseIndex yindex, float64_t yfill): - return sparse_combine(x, xindex, xfill, - y, yindex, yfill, __div) - -cpdef sparse_rdiv(ndarray x, SparseIndex xindex, float64_t xfill, - ndarray y, SparseIndex yindex, float64_t yfill): - return sparse_combine(x, xindex, xfill, - y, yindex, yfill, __rdiv) - -sparse_truediv = sparse_div -sparse_rtruediv = sparse_rdiv - -cpdef sparse_floordiv(ndarray x, SparseIndex xindex, float64_t xfill, - ndarray y, SparseIndex yindex, float64_t yfill): - return sparse_combine(x, xindex, xfill, - y, yindex, yfill, __floordiv) - -cpdef sparse_rfloordiv(ndarray x, SparseIndex xindex, float64_t xfill, - ndarray y, SparseIndex yindex, float64_t yfill): - return sparse_combine(x, xindex, xfill, - y, yindex, yfill, __rfloordiv) - -cpdef sparse_mod(ndarray x, SparseIndex xindex, float64_t xfill, - ndarray y, SparseIndex yindex, float64_t yfill): - return sparse_combine(x, xindex, xfill, - y, yindex, yfill, __mod) - -cpdef sparse_rmod(ndarray x, SparseIndex xindex, float64_t xfill, - ndarray y, SparseIndex yindex, float64_t yfill): - return sparse_combine(x, xindex, xfill, - y, yindex, yfill, __rmod) - -cpdef sparse_pow(ndarray x, SparseIndex xindex, float64_t xfill, - ndarray y, SparseIndex yindex, float64_t yfill): - return sparse_combine(x, xindex, xfill, - y, yindex, yfill, __pow) - -cpdef sparse_rpow(ndarray x, SparseIndex xindex, float64_t xfill, - ndarray y, SparseIndex yindex, float64_t yfill): - return sparse_combine(x, xindex, xfill, - y, yindex, yfill, __rpow) - -cpdef sparse_eq(ndarray x, SparseIndex xindex, float64_t xfill, - ndarray y, SparseIndex yindex, float64_t yfill): - return sparse_combine(x, xindex, xfill, - y, yindex, yfill, __eq) - -cpdef sparse_ne(ndarray x, SparseIndex xindex, float64_t xfill, - ndarray y, SparseIndex yindex, float64_t yfill): - return sparse_combine(x, xindex, xfill, - y, yindex, yfill, __ne) - -cpdef sparse_lt(ndarray x, SparseIndex xindex, float64_t xfill, - ndarray y, SparseIndex yindex, float64_t yfill): - return sparse_combine(x, xindex, xfill, - y, yindex, yfill, __lt) - -cpdef sparse_gt(ndarray x, SparseIndex xindex, float64_t xfill, - ndarray y, SparseIndex yindex, float64_t yfill): - return sparse_combine(x, xindex, xfill, - y, yindex, yfill, __gt) - -cpdef sparse_le(ndarray x, SparseIndex xindex, float64_t xfill, - ndarray y, SparseIndex yindex, float64_t yfill): - return sparse_combine(x, xindex, xfill, - y, yindex, yfill, __le) - -cpdef sparse_ge(ndarray x, SparseIndex xindex, float64_t xfill, - ndarray y, SparseIndex yindex, float64_t yfill): - return sparse_combine(x, xindex, xfill, - y, yindex, yfill, __ge) - 
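The removed helpers above (and their templated replacements in sparse_op_helper.pxi below) all encode the same zero-divisor rules; restated in plain Python as a quick sanity check:

# x / 0 -> +inf for x > 0, -inf for x < 0, NaN for 0 / 0. Floordiv follows the
# same +/-inf rule only on numpy < 1.11; on newer numpy a // 0 is NaN, which is
# what the _np_version_under1p11 branch preserves.
def div_fill(a, b):
    if b == 0:
        if a > 0:
            return float('inf')
        elif a < 0:
            return float('-inf')
        return float('nan')
    return a / b

assert div_fill(1.0, 0.0) == float('inf')
assert div_fill(-1.0, 0.0) == float('-inf')
assert div_fill(0.0, 0.0) != div_fill(0.0, 0.0)   # NaN compares unequal to itself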
-#------------------------------------------------------------------------------- +# ----------------------------------------------------------------------------- # Indexing operations def get_reindexer(ndarray[object, ndim=1] values, dict index_map): diff --git a/pandas/src/sparse_op_helper.pxi b/pandas/src/sparse_op_helper.pxi new file mode 100644 index 0000000000000..8462c31c84679 --- /dev/null +++ b/pandas/src/sparse_op_helper.pxi @@ -0,0 +1,5864 @@ +""" +Template for each `dtype` helper function for sparse ops + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +#---------------------------------------------------------------------- +# Sparse op +#---------------------------------------------------------------------- + +cdef inline float64_t __div_float64(float64_t a, float64_t b): + if b == 0: + if a > 0: + return INF + elif a < 0: + return -INF + else: + return NaN + else: + return float(a) / b + +cdef inline float64_t __truediv_float64(float64_t a, float64_t b): + return __div_float64(a, b) + +cdef inline float64_t __floordiv_float64(float64_t a, float64_t b): + if b == 0: + # numpy >= 1.11 returns NaN + # for a // 0, rather than +-inf + if _np_version_under1p11: + if a > 0: + return INF + elif a < 0: + return -INF + return NaN + else: + return a // b + +cdef inline float64_t __mod_float64(float64_t a, float64_t b): + if b == 0: + return NaN + else: + return a % b + +cdef inline float64_t __div_int64(int64_t a, int64_t b): + if b == 0: + if a > 0: + return INF + elif a < 0: + return -INF + else: + return NaN + else: + return float(a) / b + +cdef inline float64_t __truediv_int64(int64_t a, int64_t b): + return __div_int64(a, b) + +cdef inline int64_t __floordiv_int64(int64_t a, int64_t b): + if b == 0: + return 0 + else: + return a // b + +cdef inline int64_t __mod_int64(int64_t a, int64_t b): + if b == 0: + return 0 + else: + return a % b + +#---------------------------------------------------------------------- +# sparse array op +#---------------------------------------------------------------------- + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_add_float64(ndarray x_, + BlockIndex xindex, + float64_t xfill, + ndarray y_, + BlockIndex yindex, + float64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[float64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
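# Aside (illustration only, not the actual build machinery): the per-op,
# per-dtype functions in this generated file all follow one pattern, which is
# why the .pxi is produced from the companion .pxi.in template at build time
# rather than edited by hand. A toy version of that expansion:
OP_TEMPLATE = """
cpdef sparse_fill_{op}_{dtype}({dtype}_t xfill, {dtype}_t yfill):
    return xfill {sym} yfill
"""

for op, sym in [('add', '+'), ('sub', '-'), ('mul', '*')]:
    for dtype in ('float64', 'int64'):
        print(OP_TEMPLATE.format(op=op, sym=sym, dtype=dtype))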
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] + yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill + y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] + y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] + yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill + y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill + yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_add_float64(ndarray x_, IntIndex xindex, + float64_t xfill, + ndarray y_, IntIndex yindex, + float64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[float64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill + y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] + yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] + y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] + yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill + y[yi] + yi += 1 + + return out, out_index, xfill + yfill + + +cpdef sparse_add_float64(ndarray[float64_t, ndim=1] x, + SparseIndex xindex, float64_t xfill, + ndarray[float64_t, ndim=1] y, + SparseIndex yindex, float64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_add_float64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_add_float64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_fill_add_float64(float64_t xfill, + float64_t yfill): + return xfill + yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_add_int64(ndarray x_, + BlockIndex xindex, + int64_t xfill, + ndarray y_, + BlockIndex yindex, + int64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[int64_t, ndim=1] x, y + ndarray[int64_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.int64) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
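# Plain-Python model (sketch only) of int_op_add_float64 above: walk the two
# sorted index arrays with two pointers, add stored values where the indices
# match, and fall back to the other side's fill value where they do not. The
# third element of the returned tuple is the new fill value, op(xfill, yfill).
def int_op_add_model(xindices, x, xfill, yindices, y, yfill):
    out_index, out = [], []
    xi = yi = 0
    while xi < len(xindices) or yi < len(yindices):
        if yi == len(yindices) or (xi < len(xindices) and
                                   xindices[xi] < yindices[yi]):
            out_index.append(xindices[xi])
            out.append(x[xi] + yfill)          # stored only in x
            xi += 1
        elif xi == len(xindices) or yindices[yi] < xindices[xi]:
            out_index.append(yindices[yi])
            out.append(xfill + y[yi])          # stored only in y
            yi += 1
        else:                                  # stored in both
            out_index.append(xindices[xi])
            out.append(x[xi] + y[yi])
            xi += 1
            yi += 1
    return out, out_index, xfill + yfill

out, idx, fill = int_op_add_model([0, 3], [1.0, 2.0], 0.0,
                                  [3, 5], [10.0, 20.0], -1.0)
# out == [0.0, 12.0, 20.0], idx == [0, 3, 5], fill == -1.0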
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] + yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill + y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] + y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] + yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill + y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill + yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_add_int64(ndarray x_, IntIndex xindex, + int64_t xfill, + ndarray y_, IntIndex yindex, + int64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[int64_t, ndim=1] x, y + ndarray[int64_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.int64) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
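# The block_op_* variants walk a BlockIndex instead: a position within the
# current block (xbp / ybp) advances until it reaches that block's length,
# then the walk moves to the next block. Expanding (blocs, blengths) into
# explicit locations (a sketch of what to_int_index does) shows why the two
# walks visit the same points:
def expand_blocks(blocs, blengths):
    return [loc + i for loc, length in zip(blocs, blengths)
            for i in range(length)]

assert expand_blocks([0, 5], [2, 3]) == [0, 1, 5, 6, 7]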
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill + y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] + yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] + y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] + yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill + y[yi] + yi += 1 + + return out, out_index, xfill + yfill + + +cpdef sparse_add_int64(ndarray[int64_t, ndim=1] x, + SparseIndex xindex, int64_t xfill, + ndarray[int64_t, ndim=1] y, + SparseIndex yindex, int64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_add_int64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_add_int64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_fill_add_int64(int64_t xfill, + int64_t yfill): + return xfill + yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_sub_float64(ndarray x_, + BlockIndex xindex, + float64_t xfill, + ndarray y_, + BlockIndex yindex, + float64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[float64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] - yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill - y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] - y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] - yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill - y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill - yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_sub_float64(ndarray x_, IntIndex xindex, + float64_t xfill, + ndarray y_, IntIndex yindex, + float64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[float64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill - y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] - yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] - y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] - yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill - y[yi] + yi += 1 + + return out, out_index, xfill - yfill + + +cpdef sparse_sub_float64(ndarray[float64_t, ndim=1] x, + SparseIndex xindex, float64_t xfill, + ndarray[float64_t, ndim=1] y, + SparseIndex yindex, float64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_sub_float64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_sub_float64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_fill_sub_float64(float64_t xfill, + float64_t yfill): + return xfill - yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_sub_int64(ndarray x_, + BlockIndex xindex, + int64_t xfill, + ndarray y_, + BlockIndex yindex, + int64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[int64_t, ndim=1] x, y + ndarray[int64_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.int64) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] - yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill - y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] - y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] - yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill - y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill - yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_sub_int64(ndarray x_, IntIndex xindex, + int64_t xfill, + ndarray y_, IntIndex yindex, + int64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[int64_t, ndim=1] x, y + ndarray[int64_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.int64) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill - y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] - yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] - y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] - yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill - y[yi] + yi += 1 + + return out, out_index, xfill - yfill + + +cpdef sparse_sub_int64(ndarray[int64_t, ndim=1] x, + SparseIndex xindex, int64_t xfill, + ndarray[int64_t, ndim=1] y, + SparseIndex yindex, int64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_sub_int64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_sub_int64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_fill_sub_int64(int64_t xfill, + int64_t yfill): + return xfill - yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_mul_float64(ndarray x_, + BlockIndex xindex, + float64_t xfill, + ndarray y_, + BlockIndex yindex, + float64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[float64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] * yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill * y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] * y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] * yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill * y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill * yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_mul_float64(ndarray x_, IntIndex xindex, + float64_t xfill, + ndarray y_, IntIndex yindex, + float64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[float64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill * y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] * yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] * y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] * yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill * y[yi] + yi += 1 + + return out, out_index, xfill * yfill + + +cpdef sparse_mul_float64(ndarray[float64_t, ndim=1] x, + SparseIndex xindex, float64_t xfill, + ndarray[float64_t, ndim=1] y, + SparseIndex yindex, float64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_mul_float64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_mul_float64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_fill_mul_float64(float64_t xfill, + float64_t yfill): + return xfill * yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_mul_int64(ndarray x_, + BlockIndex xindex, + int64_t xfill, + ndarray y_, + BlockIndex yindex, + int64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[int64_t, ndim=1] x, y + ndarray[int64_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.int64) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] * yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill * y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] * y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] * yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill * y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill * yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_mul_int64(ndarray x_, IntIndex xindex, + int64_t xfill, + ndarray y_, IntIndex yindex, + int64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[int64_t, ndim=1] x, y + ndarray[int64_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.int64) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill * y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] * yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] * y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] * yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill * y[yi] + yi += 1 + + return out, out_index, xfill * yfill + + +cpdef sparse_mul_int64(ndarray[int64_t, ndim=1] x, + SparseIndex xindex, int64_t xfill, + ndarray[int64_t, ndim=1] y, + SparseIndex yindex, int64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_mul_int64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_mul_int64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_fill_mul_int64(int64_t xfill, + int64_t yfill): + return xfill * yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_div_float64(ndarray x_, + BlockIndex xindex, + float64_t xfill, + ndarray y_, + BlockIndex yindex, + float64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[float64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = __div_float64(x[xi], yfill) + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = __div_float64(xfill, y[yi]) + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = __div_float64(x[xi], y[yi]) + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = __div_float64(x[xi], yfill) + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = __div_float64(xfill, y[yi]) + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, __div_float64(xfill, yfill) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_div_float64(ndarray x_, IntIndex xindex, + float64_t xfill, + ndarray y_, IntIndex yindex, + float64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[float64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = __div_float64(xfill, y[yi]) + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = __div_float64(x[xi], yfill) + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = __div_float64(x[xi], y[yi]) + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = __div_float64(x[xi], yfill) + xi += 1 + else: + # use x fill value + out[out_i] = __div_float64(xfill, y[yi]) + yi += 1 + + return out, out_index, __div_float64(xfill, yfill) + + +cpdef sparse_div_float64(ndarray[float64_t, ndim=1] x, + SparseIndex xindex, float64_t xfill, + ndarray[float64_t, ndim=1] y, + SparseIndex yindex, float64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_div_float64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_div_float64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_fill_div_float64(float64_t xfill, + float64_t yfill): + return __div_float64(xfill, yfill) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_div_int64(ndarray x_, + BlockIndex xindex, + int64_t xfill, + ndarray y_, + BlockIndex yindex, + int64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[int64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = __div_int64(x[xi], yfill) + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = __div_int64(xfill, y[yi]) + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = __div_int64(x[xi], y[yi]) + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = __div_int64(x[xi], yfill) + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = __div_int64(xfill, y[yi]) + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, __div_int64(xfill, yfill) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_div_int64(ndarray x_, IntIndex xindex, + int64_t xfill, + ndarray y_, IntIndex yindex, + int64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[int64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = __div_int64(xfill, y[yi]) + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = __div_int64(x[xi], yfill) + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = __div_int64(x[xi], y[yi]) + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = __div_int64(x[xi], yfill) + xi += 1 + else: + # use x fill value + out[out_i] = __div_int64(xfill, y[yi]) + yi += 1 + + return out, out_index, __div_int64(xfill, yfill) + + +cpdef sparse_div_int64(ndarray[int64_t, ndim=1] x, + SparseIndex xindex, int64_t xfill, + ndarray[int64_t, ndim=1] y, + SparseIndex yindex, int64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_div_int64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_div_int64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_fill_div_int64(int64_t xfill, + int64_t yfill): + return __div_int64(xfill, yfill) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_mod_float64(ndarray x_, + BlockIndex xindex, + float64_t xfill, + ndarray y_, + BlockIndex yindex, + float64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[float64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = __mod_float64(x[xi], yfill) + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = __mod_float64(xfill, y[yi]) + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = __mod_float64(x[xi], y[yi]) + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = __mod_float64(x[xi], yfill) + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = __mod_float64(xfill, y[yi]) + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, __mod_float64(xfill, yfill) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_mod_float64(ndarray x_, IntIndex xindex, + float64_t xfill, + ndarray y_, IntIndex yindex, + float64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[float64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = __mod_float64(xfill, y[yi]) + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = __mod_float64(x[xi], yfill) + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = __mod_float64(x[xi], y[yi]) + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = __mod_float64(x[xi], yfill) + xi += 1 + else: + # use x fill value + out[out_i] = __mod_float64(xfill, y[yi]) + yi += 1 + + return out, out_index, __mod_float64(xfill, yfill) + + +cpdef sparse_mod_float64(ndarray[float64_t, ndim=1] x, + SparseIndex xindex, float64_t xfill, + ndarray[float64_t, ndim=1] y, + SparseIndex yindex, float64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_mod_float64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_mod_float64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_fill_mod_float64(float64_t xfill, + float64_t yfill): + return __mod_float64(xfill, yfill) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_mod_int64(ndarray x_, + BlockIndex xindex, + int64_t xfill, + ndarray y_, + BlockIndex yindex, + int64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[int64_t, ndim=1] x, y + ndarray[int64_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.int64) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
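# Note on the mod helpers used above and below (a sketch of the rules defined
# earlier in this file): a zero divisor yields NaN for the float64 variant but
# 0 for the int64 variant, matching numpy's own behaviour for float and
# integer arrays respectively.
import math

def mod_float64_model(a, b):
    return float('nan') if b == 0 else a % b

def mod_int64_model(a, b):
    return 0 if b == 0 else a % b

assert mod_int64_model(7, 0) == 0
assert math.isnan(mod_float64_model(7.0, 0.0))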
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = __mod_int64(x[xi], yfill) + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = __mod_int64(xfill, y[yi]) + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = __mod_int64(x[xi], y[yi]) + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = __mod_int64(x[xi], yfill) + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = __mod_int64(xfill, y[yi]) + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, __mod_int64(xfill, yfill) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_mod_int64(ndarray x_, IntIndex xindex, + int64_t xfill, + ndarray y_, IntIndex yindex, + int64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[int64_t, ndim=1] x, y + ndarray[int64_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.int64) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = __mod_int64(xfill, y[yi]) + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = __mod_int64(x[xi], yfill) + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = __mod_int64(x[xi], y[yi]) + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = __mod_int64(x[xi], yfill) + xi += 1 + else: + # use x fill value + out[out_i] = __mod_int64(xfill, y[yi]) + yi += 1 + + return out, out_index, __mod_int64(xfill, yfill) + + +cpdef sparse_mod_int64(ndarray[int64_t, ndim=1] x, + SparseIndex xindex, int64_t xfill, + ndarray[int64_t, ndim=1] y, + SparseIndex yindex, int64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_mod_int64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_mod_int64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_fill_mod_int64(int64_t xfill, + int64_t yfill): + return __mod_int64(xfill, yfill) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_truediv_float64(ndarray x_, + BlockIndex xindex, + float64_t xfill, + ndarray y_, + BlockIndex yindex, + float64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[float64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = __truediv_float64(x[xi], yfill) + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = __truediv_float64(xfill, y[yi]) + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = __truediv_float64(x[xi], y[yi]) + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = __truediv_float64(x[xi], yfill) + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = __truediv_float64(xfill, y[yi]) + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, __truediv_float64(xfill, yfill) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_truediv_float64(ndarray x_, IntIndex xindex, + float64_t xfill, + ndarray y_, IntIndex yindex, + float64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[float64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = __truediv_float64(xfill, y[yi]) + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = __truediv_float64(x[xi], yfill) + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = __truediv_float64(x[xi], y[yi]) + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = __truediv_float64(x[xi], yfill) + xi += 1 + else: + # use x fill value + out[out_i] = __truediv_float64(xfill, y[yi]) + yi += 1 + + return out, out_index, __truediv_float64(xfill, yfill) + + +cpdef sparse_truediv_float64(ndarray[float64_t, ndim=1] x, + SparseIndex xindex, float64_t xfill, + ndarray[float64_t, ndim=1] y, + SparseIndex yindex, float64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_truediv_float64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_truediv_float64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_fill_truediv_float64(float64_t xfill, + float64_t yfill): + return __truediv_float64(xfill, yfill) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_truediv_int64(ndarray x_, + BlockIndex xindex, + int64_t xfill, + ndarray y_, + BlockIndex yindex, + int64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[int64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
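+    # For the int64 variant of truediv the output buffer declared above is
+    # float64: true division of integer inputs produces floating point
+    # results, so only the operand arrays stay int64.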
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = __truediv_int64(x[xi], yfill) + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = __truediv_int64(xfill, y[yi]) + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = __truediv_int64(x[xi], y[yi]) + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = __truediv_int64(x[xi], yfill) + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = __truediv_int64(xfill, y[yi]) + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, __truediv_int64(xfill, yfill) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_truediv_int64(ndarray x_, IntIndex xindex, + int64_t xfill, + ndarray y_, IntIndex yindex, + int64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[int64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = __truediv_int64(xfill, y[yi]) + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = __truediv_int64(x[xi], yfill) + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = __truediv_int64(x[xi], y[yi]) + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = __truediv_int64(x[xi], yfill) + xi += 1 + else: + # use x fill value + out[out_i] = __truediv_int64(xfill, y[yi]) + yi += 1 + + return out, out_index, __truediv_int64(xfill, yfill) + + +cpdef sparse_truediv_int64(ndarray[int64_t, ndim=1] x, + SparseIndex xindex, int64_t xfill, + ndarray[int64_t, ndim=1] y, + SparseIndex yindex, int64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_truediv_int64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_truediv_int64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_fill_truediv_int64(int64_t xfill, + int64_t yfill): + return __truediv_int64(xfill, yfill) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_floordiv_float64(ndarray x_, + BlockIndex xindex, + float64_t xfill, + ndarray y_, + BlockIndex yindex, + float64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[float64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = __floordiv_float64(x[xi], yfill) + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = __floordiv_float64(xfill, y[yi]) + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = __floordiv_float64(x[xi], y[yi]) + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = __floordiv_float64(x[xi], yfill) + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = __floordiv_float64(xfill, y[yi]) + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, __floordiv_float64(xfill, yfill) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_floordiv_float64(ndarray x_, IntIndex xindex, + float64_t xfill, + ndarray y_, IntIndex yindex, + float64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[float64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = __floordiv_float64(xfill, y[yi]) + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = __floordiv_float64(x[xi], yfill) + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = __floordiv_float64(x[xi], y[yi]) + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = __floordiv_float64(x[xi], yfill) + xi += 1 + else: + # use x fill value + out[out_i] = __floordiv_float64(xfill, y[yi]) + yi += 1 + + return out, out_index, __floordiv_float64(xfill, yfill) + + +cpdef sparse_floordiv_float64(ndarray[float64_t, ndim=1] x, + SparseIndex xindex, float64_t xfill, + ndarray[float64_t, ndim=1] y, + SparseIndex yindex, float64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_floordiv_float64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_floordiv_float64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_fill_floordiv_float64(float64_t xfill, + float64_t yfill): + return __floordiv_float64(xfill, yfill) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_floordiv_int64(ndarray x_, + BlockIndex xindex, + int64_t xfill, + ndarray y_, + BlockIndex yindex, + int64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[int64_t, ndim=1] x, y + ndarray[int64_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.int64) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = __floordiv_int64(x[xi], yfill) + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = __floordiv_int64(xfill, y[yi]) + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = __floordiv_int64(x[xi], y[yi]) + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = __floordiv_int64(x[xi], yfill) + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = __floordiv_int64(xfill, y[yi]) + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, __floordiv_int64(xfill, yfill) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_floordiv_int64(ndarray x_, IntIndex xindex, + int64_t xfill, + ndarray y_, IntIndex yindex, + int64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[int64_t, ndim=1] x, y + ndarray[int64_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.int64) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = __floordiv_int64(xfill, y[yi]) + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = __floordiv_int64(x[xi], yfill) + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = __floordiv_int64(x[xi], y[yi]) + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = __floordiv_int64(x[xi], yfill) + xi += 1 + else: + # use x fill value + out[out_i] = __floordiv_int64(xfill, y[yi]) + yi += 1 + + return out, out_index, __floordiv_int64(xfill, yfill) + + +cpdef sparse_floordiv_int64(ndarray[int64_t, ndim=1] x, + SparseIndex xindex, int64_t xfill, + ndarray[int64_t, ndim=1] y, + SparseIndex yindex, int64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_floordiv_int64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_floordiv_int64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_fill_floordiv_int64(int64_t xfill, + int64_t yfill): + return __floordiv_int64(xfill, yfill) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_pow_float64(ndarray x_, + BlockIndex xindex, + float64_t xfill, + ndarray y_, + BlockIndex yindex, + float64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[float64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
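+    # pow uses the ** operator inline instead of a __pow_* helper, but the
+    # walk over the two indexes is otherwise identical to the other ops.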
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] ** yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill ** y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] ** y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] ** yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill ** y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill ** yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_pow_float64(ndarray x_, IntIndex xindex, + float64_t xfill, + ndarray y_, IntIndex yindex, + float64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[float64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill ** y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] ** yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] ** y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] ** yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill ** y[yi] + yi += 1 + + return out, out_index, xfill ** yfill + + +cpdef sparse_pow_float64(ndarray[float64_t, ndim=1] x, + SparseIndex xindex, float64_t xfill, + ndarray[float64_t, ndim=1] y, + SparseIndex yindex, float64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_pow_float64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_pow_float64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_fill_pow_float64(float64_t xfill, + float64_t yfill): + return xfill ** yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_pow_int64(ndarray x_, + BlockIndex xindex, + int64_t xfill, + ndarray y_, + BlockIndex yindex, + int64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[int64_t, ndim=1] x, y + ndarray[int64_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.int64) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] ** yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill ** y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] ** y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] ** yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill ** y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill ** yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_pow_int64(ndarray x_, IntIndex xindex, + int64_t xfill, + ndarray y_, IntIndex yindex, + int64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[int64_t, ndim=1] x, y + ndarray[int64_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.int64) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill ** y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] ** yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] ** y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] ** yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill ** y[yi] + yi += 1 + + return out, out_index, xfill ** yfill + + +cpdef sparse_pow_int64(ndarray[int64_t, ndim=1] x, + SparseIndex xindex, int64_t xfill, + ndarray[int64_t, ndim=1] y, + SparseIndex yindex, int64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_pow_int64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_pow_int64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_fill_pow_int64(int64_t xfill, + int64_t yfill): + return xfill ** yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_eq_float64(ndarray x_, + BlockIndex xindex, + float64_t xfill, + ndarray y_, + BlockIndex yindex, + float64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[float64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
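+    # The comparison variants (eq, ne, lt, gt, ...) reuse the same walk;
+    # each output slot stores the 0/1 result of the comparison, which is
+    # why out is declared with dtype uint8 rather than the operand dtype.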
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] == yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill == y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] == y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] == yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill == y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill == yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_eq_float64(ndarray x_, IntIndex xindex, + float64_t xfill, + ndarray y_, IntIndex yindex, + float64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[float64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill == y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] == yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] == y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] == yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill == y[yi] + yi += 1 + + return out, out_index, xfill == yfill + + +cpdef sparse_eq_float64(ndarray[float64_t, ndim=1] x, + SparseIndex xindex, float64_t xfill, + ndarray[float64_t, ndim=1] y, + SparseIndex yindex, float64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_eq_float64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_eq_float64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_fill_eq_float64(float64_t xfill, + float64_t yfill): + return xfill == yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_eq_int64(ndarray x_, + BlockIndex xindex, + int64_t xfill, + ndarray y_, + BlockIndex yindex, + int64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[int64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] == yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill == y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] == y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] == yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill == y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill == yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_eq_int64(ndarray x_, IntIndex xindex, + int64_t xfill, + ndarray y_, IntIndex yindex, + int64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[int64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill == y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] == yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] == y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] == yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill == y[yi] + yi += 1 + + return out, out_index, xfill == yfill + + +cpdef sparse_eq_int64(ndarray[int64_t, ndim=1] x, + SparseIndex xindex, int64_t xfill, + ndarray[int64_t, ndim=1] y, + SparseIndex yindex, int64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_eq_int64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_eq_int64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_fill_eq_int64(int64_t xfill, + int64_t yfill): + return xfill == yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_ne_float64(ndarray x_, + BlockIndex xindex, + float64_t xfill, + ndarray y_, + BlockIndex yindex, + float64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[float64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] != yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill != y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] != y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] != yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill != y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill != yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_ne_float64(ndarray x_, IntIndex xindex, + float64_t xfill, + ndarray y_, IntIndex yindex, + float64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[float64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill != y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] != yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] != y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] != yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill != y[yi] + yi += 1 + + return out, out_index, xfill != yfill + + +cpdef sparse_ne_float64(ndarray[float64_t, ndim=1] x, + SparseIndex xindex, float64_t xfill, + ndarray[float64_t, ndim=1] y, + SparseIndex yindex, float64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_ne_float64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_ne_float64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_fill_ne_float64(float64_t xfill, + float64_t yfill): + return xfill != yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_ne_int64(ndarray x_, + BlockIndex xindex, + int64_t xfill, + ndarray y_, + BlockIndex yindex, + int64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[int64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] != yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill != y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] != y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] != yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill != y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill != yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_ne_int64(ndarray x_, IntIndex xindex, + int64_t xfill, + ndarray y_, IntIndex yindex, + int64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[int64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill != y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] != yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] != y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] != yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill != y[yi] + yi += 1 + + return out, out_index, xfill != yfill + + +cpdef sparse_ne_int64(ndarray[int64_t, ndim=1] x, + SparseIndex xindex, int64_t xfill, + ndarray[int64_t, ndim=1] y, + SparseIndex yindex, int64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_ne_int64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_ne_int64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_fill_ne_int64(int64_t xfill, + int64_t yfill): + return xfill != yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_lt_float64(ndarray x_, + BlockIndex xindex, + float64_t xfill, + ndarray y_, + BlockIndex yindex, + float64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[float64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] < yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill < y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] < y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] < yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill < y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill < yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_lt_float64(ndarray x_, IntIndex xindex, + float64_t xfill, + ndarray y_, IntIndex yindex, + float64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[float64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill < y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] < yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] < y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] < yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill < y[yi] + yi += 1 + + return out, out_index, xfill < yfill + + +cpdef sparse_lt_float64(ndarray[float64_t, ndim=1] x, + SparseIndex xindex, float64_t xfill, + ndarray[float64_t, ndim=1] y, + SparseIndex yindex, float64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_lt_float64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_lt_float64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_fill_lt_float64(float64_t xfill, + float64_t yfill): + return xfill < yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_lt_int64(ndarray x_, + BlockIndex xindex, + int64_t xfill, + ndarray y_, + BlockIndex yindex, + int64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[int64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] < yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill < y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] < y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] < yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill < y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill < yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_lt_int64(ndarray x_, IntIndex xindex, + int64_t xfill, + ndarray y_, IntIndex yindex, + int64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[int64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill < y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] < yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] < y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] < yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill < y[yi] + yi += 1 + + return out, out_index, xfill < yfill + + +cpdef sparse_lt_int64(ndarray[int64_t, ndim=1] x, + SparseIndex xindex, int64_t xfill, + ndarray[int64_t, ndim=1] y, + SparseIndex yindex, int64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_lt_int64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_lt_int64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_fill_lt_int64(int64_t xfill, + int64_t yfill): + return xfill < yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_gt_float64(ndarray x_, + BlockIndex xindex, + float64_t xfill, + ndarray y_, + BlockIndex yindex, + float64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[float64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] > yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill > y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] > y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] > yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill > y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill > yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_gt_float64(ndarray x_, IntIndex xindex, + float64_t xfill, + ndarray y_, IntIndex yindex, + float64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[float64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill > y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] > yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] > y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] > yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill > y[yi] + yi += 1 + + return out, out_index, xfill > yfill + + +cpdef sparse_gt_float64(ndarray[float64_t, ndim=1] x, + SparseIndex xindex, float64_t xfill, + ndarray[float64_t, ndim=1] y, + SparseIndex yindex, float64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_gt_float64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_gt_float64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_fill_gt_float64(float64_t xfill, + float64_t yfill): + return xfill > yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_gt_int64(ndarray x_, + BlockIndex xindex, + int64_t xfill, + ndarray y_, + BlockIndex yindex, + int64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[int64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] > yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill > y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] > y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] > yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill > y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill > yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_gt_int64(ndarray x_, IntIndex xindex, + int64_t xfill, + ndarray y_, IntIndex yindex, + int64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[int64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill > y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] > yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] > y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] > yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill > y[yi] + yi += 1 + + return out, out_index, xfill > yfill + + +cpdef sparse_gt_int64(ndarray[int64_t, ndim=1] x, + SparseIndex xindex, int64_t xfill, + ndarray[int64_t, ndim=1] y, + SparseIndex yindex, int64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_gt_int64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_gt_int64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_fill_gt_int64(int64_t xfill, + int64_t yfill): + return xfill > yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_le_float64(ndarray x_, + BlockIndex xindex, + float64_t xfill, + ndarray y_, + BlockIndex yindex, + float64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[float64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] <= yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill <= y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] <= y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] <= yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill <= y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill <= yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_le_float64(ndarray x_, IntIndex xindex, + float64_t xfill, + ndarray y_, IntIndex yindex, + float64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[float64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill <= y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] <= yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] <= y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] <= yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill <= y[yi] + yi += 1 + + return out, out_index, xfill <= yfill + + +cpdef sparse_le_float64(ndarray[float64_t, ndim=1] x, + SparseIndex xindex, float64_t xfill, + ndarray[float64_t, ndim=1] y, + SparseIndex yindex, float64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_le_float64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_le_float64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_fill_le_float64(float64_t xfill, + float64_t yfill): + return xfill <= yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_le_int64(ndarray x_, + BlockIndex xindex, + int64_t xfill, + ndarray y_, + BlockIndex yindex, + int64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[int64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] <= yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill <= y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] <= y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] <= yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill <= y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill <= yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_le_int64(ndarray x_, IntIndex xindex, + int64_t xfill, + ndarray y_, IntIndex yindex, + int64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[int64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill <= y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] <= yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] <= y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] <= yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill <= y[yi] + yi += 1 + + return out, out_index, xfill <= yfill + + +cpdef sparse_le_int64(ndarray[int64_t, ndim=1] x, + SparseIndex xindex, int64_t xfill, + ndarray[int64_t, ndim=1] y, + SparseIndex yindex, int64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_le_int64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_le_int64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_fill_le_int64(int64_t xfill, + int64_t yfill): + return xfill <= yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_ge_float64(ndarray x_, + BlockIndex xindex, + float64_t xfill, + ndarray y_, + BlockIndex yindex, + float64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[float64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] >= yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill >= y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] >= y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] >= yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill >= y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill >= yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_ge_float64(ndarray x_, IntIndex xindex, + float64_t xfill, + ndarray y_, IntIndex yindex, + float64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[float64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill >= y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] >= yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] >= y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] >= yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill >= y[yi] + yi += 1 + + return out, out_index, xfill >= yfill + + +cpdef sparse_ge_float64(ndarray[float64_t, ndim=1] x, + SparseIndex xindex, float64_t xfill, + ndarray[float64_t, ndim=1] y, + SparseIndex yindex, float64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_ge_float64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_ge_float64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_fill_ge_float64(float64_t xfill, + float64_t yfill): + return xfill >= yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_ge_int64(ndarray x_, + BlockIndex xindex, + int64_t xfill, + ndarray y_, + BlockIndex yindex, + int64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[int64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] >= yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill >= y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] >= y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] >= yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill >= y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill >= yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_ge_int64(ndarray x_, IntIndex xindex, + int64_t xfill, + ndarray y_, IntIndex yindex, + int64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[int64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill >= y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] >= yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] >= y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] >= yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill >= y[yi] + yi += 1 + + return out, out_index, xfill >= yfill + + +cpdef sparse_ge_int64(ndarray[int64_t, ndim=1] x, + SparseIndex xindex, int64_t xfill, + ndarray[int64_t, ndim=1] y, + SparseIndex yindex, int64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_ge_int64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_ge_int64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_fill_ge_int64(int64_t xfill, + int64_t yfill): + return xfill >= yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_and_int64(ndarray x_, + BlockIndex xindex, + int64_t xfill, + ndarray y_, + BlockIndex yindex, + int64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[int64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] & yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill & y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] & y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] & yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill & y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill & yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_and_int64(ndarray x_, IntIndex xindex, + int64_t xfill, + ndarray y_, IntIndex yindex, + int64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[int64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill & y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] & yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] & y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] & yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill & y[yi] + yi += 1 + + return out, out_index, xfill & yfill + + +cpdef sparse_and_int64(ndarray[int64_t, ndim=1] x, + SparseIndex xindex, int64_t xfill, + ndarray[int64_t, ndim=1] y, + SparseIndex yindex, int64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_and_int64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_and_int64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_fill_and_int64(int64_t xfill, + int64_t yfill): + return xfill & yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_and_uint8(ndarray x_, + BlockIndex xindex, + uint8_t xfill, + ndarray y_, + BlockIndex yindex, + uint8_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[uint8_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] & yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill & y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] & y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] & yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill & y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill & yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_and_uint8(ndarray x_, IntIndex xindex, + uint8_t xfill, + ndarray y_, IntIndex yindex, + uint8_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[uint8_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill & y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] & yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] & y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] & yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill & y[yi] + yi += 1 + + return out, out_index, xfill & yfill + + +cpdef sparse_and_uint8(ndarray[uint8_t, ndim=1] x, + SparseIndex xindex, uint8_t xfill, + ndarray[uint8_t, ndim=1] y, + SparseIndex yindex, uint8_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_and_uint8(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_and_uint8(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_fill_and_uint8(uint8_t xfill, + uint8_t yfill): + return xfill & yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_or_int64(ndarray x_, + BlockIndex xindex, + int64_t xfill, + ndarray y_, + BlockIndex yindex, + int64_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[int64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] | yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill | y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] | y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] | yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill | y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill | yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_or_int64(ndarray x_, IntIndex xindex, + int64_t xfill, + ndarray y_, IntIndex yindex, + int64_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[int64_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill | y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] | yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] | y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] | yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill | y[yi] + yi += 1 + + return out, out_index, xfill | yfill + + +cpdef sparse_or_int64(ndarray[int64_t, ndim=1] x, + SparseIndex xindex, int64_t xfill, + ndarray[int64_t, ndim=1] y, + SparseIndex yindex, int64_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_or_int64(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_or_int64(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_fill_or_int64(int64_t xfill, + int64_t yfill): + return xfill | yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_or_uint8(ndarray x_, + BlockIndex xindex, + uint8_t xfill, + ndarray y_, + BlockIndex yindex, + uint8_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[uint8_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = x[xi] | yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = xfill | y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] | y[yi] + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] | yfill + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = xfill | y[yi] + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, xfill | yfill + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_or_uint8(ndarray x_, IntIndex xindex, + uint8_t xfill, + ndarray y_, IntIndex yindex, + uint8_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[uint8_t, ndim=1] x, y + ndarray[uint8_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.uint8) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = xfill | y[yi] + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = x[xi] | yfill + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = x[xi] | y[yi] + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = x[xi] | yfill + xi += 1 + else: + # use x fill value + out[out_i] = xfill | y[yi] + yi += 1 + + return out, out_index, xfill | yfill + + +cpdef sparse_or_uint8(ndarray[uint8_t, ndim=1] x, + SparseIndex xindex, uint8_t xfill, + ndarray[uint8_t, ndim=1] y, + SparseIndex yindex, uint8_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_or_uint8(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_or_uint8(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_fill_or_uint8(uint8_t xfill, + uint8_t yfill): + return xfill | yfill diff --git a/pandas/src/sparse_op_helper.pxi.in b/pandas/src/sparse_op_helper.pxi.in new file mode 100644 index 0000000000000..d1d9a6f02a72c --- /dev/null +++ b/pandas/src/sparse_op_helper.pxi.in @@ -0,0 +1,341 @@ +""" +Template for each `dtype` helper function for sparse ops + +WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in +""" + +#---------------------------------------------------------------------- +# Sparse op +#---------------------------------------------------------------------- + +{{py: + +# dtype, float_group +dtypes = [('float64', True), ('int64', False)] + +}} + +{{for dtype, float_group in dtypes}} + +{{if float_group}} + +cdef inline {{dtype}}_t __div_{{dtype}}({{dtype}}_t a, {{dtype}}_t b): + if b == 0: + if a > 0: + return INF + elif a < 0: + return -INF + else: + return NaN + else: + return float(a) / b + +cdef inline {{dtype}}_t __truediv_{{dtype}}({{dtype}}_t a, {{dtype}}_t b): + return __div_{{dtype}}(a, b) + +cdef inline {{dtype}}_t __floordiv_{{dtype}}({{dtype}}_t a, {{dtype}}_t b): + if b == 0: + # numpy >= 1.11 returns NaN + # for a // 0, rather than +-inf + if _np_version_under1p11: + if a > 0: + return INF + elif a < 0: + return -INF + return NaN + else: + return a // b + +cdef inline {{dtype}}_t __mod_{{dtype}}({{dtype}}_t a, {{dtype}}_t b): + if b == 0: + return NaN + else: + return a % b + +{{else}} + +cdef inline float64_t __div_{{dtype}}({{dtype}}_t a, {{dtype}}_t b): + if b == 0: + if a > 0: + return INF + elif a < 0: + return -INF + else: + return NaN + else: + return float(a) / b + +cdef inline float64_t __truediv_{{dtype}}({{dtype}}_t a, {{dtype}}_t b): + return __div_{{dtype}}(a, b) + +cdef inline {{dtype}}_t __floordiv_{{dtype}}({{dtype}}_t a, {{dtype}}_t b): + if b == 0: + return 0 + else: + return a // b + +cdef inline {{dtype}}_t __mod_{{dtype}}({{dtype}}_t a, {{dtype}}_t b): + if b == 0: + return 0 + else: + return a % b + +{{endif}} + +{{endfor}} + +#---------------------------------------------------------------------- +# sparse array op +#---------------------------------------------------------------------- + +{{py: + +# dtype, arith_comp_group, logical_group +dtypes = [('float64', True, False), + ('int64', True, True), + ('uint8', False, True)] +# do not generate arithmetic / comparison template for uint8, +# it should be done in fused types + +def get_op(tup): + 
assert isinstance(tup, tuple) + assert len(tup) == 4 + + opname, lval, rval, dtype = tup + + ops_dict = {'add': '{0} + {1}', + 'sub': '{0} - {1}', + 'mul': '{0} * {1}', + 'div': '__div_{2}({0}, {1})', + 'mod': '__mod_{2}({0}, {1})', + 'truediv': '__truediv_{2}({0}, {1})', + 'floordiv': '__floordiv_{2}({0}, {1})', + 'pow': '{0} ** {1}', + 'eq': '{0} == {1}', + 'ne': '{0} != {1}', + 'lt': '{0} < {1}', + 'gt': '{0} > {1}', + 'le': '{0} <= {1}', + 'ge': '{0} >= {1}', + + 'and': '{0} & {1}', # logical op + 'or': '{0} | {1}'} + + return ops_dict[opname].format(lval, rval, dtype) + + +def get_dispatch(dtypes): + + ops_list = ['add', 'sub', 'mul', 'div', 'mod', 'truediv', + 'floordiv', 'pow', + 'eq', 'ne', 'lt', 'gt', 'le', 'ge', + 'and', 'or'] + + for opname in ops_list: + for dtype, arith_comp_group, logical_group in dtypes: + + if opname in ('div', 'truediv'): + rdtype = 'float64' + elif opname in ('eq', 'ne', 'lt', 'gt', 'le', 'ge'): + # comparison op + rdtype = 'uint8' + elif opname in ('and', 'or'): + # logical op + rdtype = 'uint8' + else: + rdtype = dtype + + if opname in ('and', 'or'): + if logical_group: + yield opname, dtype, rdtype + else: + if arith_comp_group: + yield opname, dtype, rdtype + +}} + + +{{for opname, dtype, rdtype in get_dispatch(dtypes)}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple block_op_{{opname}}_{{dtype}}(ndarray x_, + BlockIndex xindex, + {{dtype}}_t xfill, + ndarray y_, + BlockIndex yindex, + {{dtype}}_t yfill): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[{{dtype}}_t, ndim=1] x, y + ndarray[{{rdtype}}_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.{{rdtype}}) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
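To make the Tempita expansion concrete: get_op formats an expression string from (opname, left, right, dtype), and get_dispatch selects the result dtype (float64 for div/truediv, uint8 for comparisons and logical ops, otherwise the input dtype). A tiny self-contained rendering of that mapping, using only an illustrative subset of ops_dict:

# illustrative subset of the ops_dict defined in the template above
ops_dict = {'gt': '{0} > {1}',
            'div': '__div_{2}({0}, {1})'}

def get_op(tup):
    opname, lval, rval, dtype = tup
    return ops_dict[opname].format(lval, rval, dtype)

print(get_op(('gt', 'x[xi]', 'y[yi]', 'float64')))   # x[xi] > y[yi]
print(get_op(('div', 'x[xi]', 'yfill', 'float64')))  # __div_float64(x[xi], yfill)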
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = {{(opname, 'x[xi]', 'yfill', dtype) | get_op}} + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = {{(opname, 'xfill', 'y[yi]', dtype) | get_op}} + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = {{(opname, 'x[xi]', 'y[yi]', dtype) | get_op}} + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = {{(opname, 'x[xi]', 'yfill', dtype) | get_op}} + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = {{(opname, 'xfill', 'y[yi]', dtype) | get_op}} + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index, {{(opname, 'xfill', 'yfill', dtype) | get_op}} + + +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline tuple int_op_{{opname}}_{{dtype}}(ndarray x_, IntIndex xindex, + {{dtype}}_t xfill, + ndarray y_, IntIndex yindex, + {{dtype}}_t yfill): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[{{dtype}}_t, ndim=1] x, y + ndarray[{{rdtype}}_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.{{rdtype}}) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = {{(opname, 'xfill', 'y[yi]', dtype) | get_op}} + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = {{(opname, 'x[xi]', 'yfill', dtype) | get_op}} + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = {{(opname, 'x[xi]', 'y[yi]', dtype) | get_op}} + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = {{(opname, 'x[xi]', 'yfill', dtype) | get_op}} + xi += 1 + else: + # use x fill value + out[out_i] = {{(opname, 'xfill', 'y[yi]', dtype) | get_op}} + yi += 1 + + return out, out_index, {{(opname, 'xfill', 'yfill', dtype) | get_op}} + + +cpdef sparse_{{opname}}_{{dtype}}(ndarray[{{dtype}}_t, ndim=1] x, + SparseIndex xindex, {{dtype}}_t xfill, + ndarray[{{dtype}}_t, ndim=1] y, + SparseIndex yindex, {{dtype}}_t yfill): + + if isinstance(xindex, BlockIndex): + return block_op_{{opname}}_{{dtype}}(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill) + elif isinstance(xindex, IntIndex): + return int_op_{{opname}}_{{dtype}}(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill) + else: + raise NotImplementedError + + +cpdef sparse_fill_{{opname}}_{{dtype}}({{dtype}}_t xfill, + {{dtype}}_t yfill): + return {{(opname, 'xfill', 'yfill', dtype) | get_op}} + +{{endfor}} diff --git a/pandas/src/testing.pyx b/pandas/src/testing.pyx index 6780cf311c244..cda21ba9c4ce1 100644 --- a/pandas/src/testing.pyx +++ b/pandas/src/testing.pyx @@ -1,7 +1,8 @@ import numpy as np from pandas import compat -from pandas.core.common import isnull, array_equivalent, is_dtype_equal +from pandas.types.missing import isnull, array_equivalent +from pandas.types.common import is_dtype_equal cdef NUMERIC_TYPES = ( bool, @@ -67,13 +68,14 @@ cpdef assert_almost_equal(a, b, b : object check_less_precise : bool or int, default False Specify comparison precision. - 5 digits (False) or 3 digits (True) after decimal points are compared. - If an integer, then this will be the number of decimal points to compare + 5 digits (False) or 3 digits (True) after decimal points are + compared. 
If an integer, then this will be the number of decimal + points to compare check_dtype: bool, default True check dtype if both a and b are np.ndarray obj : str, default None - Specify object name being compared, internally used to show appropriate - assertion message + Specify object name being compared, internally used to show + appropriate assertion message lobj : str, default None Specify left object name being compared, internally used to show appropriate assertion message @@ -128,8 +130,9 @@ cpdef assert_almost_equal(a, b, na, nb = a.size, b.size if a.shape != b.shape: from pandas.util.testing import raise_assert_detail - raise_assert_detail(obj, '{0} shapes are different'.format(obj), - a.shape, b.shape) + raise_assert_detail( + obj, '{0} shapes are different'.format(obj), + a.shape, b.shape) if check_dtype and not is_dtype_equal(a, b): from pandas.util.testing import assert_attr_equal @@ -145,19 +148,28 @@ cpdef assert_almost_equal(a, b, if na != nb: from pandas.util.testing import raise_assert_detail + + # if we have a small diff set, print it + if abs(na - nb) < 10: + r = list(set(a) ^ set(b)) + else: + r = None + raise_assert_detail(obj, '{0} length are different'.format(obj), - na, nb) + na, nb, r) for i in xrange(len(a)): try: - assert_almost_equal(a[i], b[i], check_less_precise=check_less_precise) + assert_almost_equal(a[i], b[i], + check_less_precise=check_less_precise) except AssertionError: is_unequal = True diff += 1 if is_unequal: from pandas.util.testing import raise_assert_detail - msg = '{0} values are different ({1} %)'.format(obj, np.round(diff * 100.0 / na, 5)) + msg = '{0} values are different ({1} %)'.format( + obj, np.round(diff * 100.0 / na, 5)) raise_assert_detail(obj, msg, lobj, robj) return True @@ -190,12 +202,12 @@ cpdef assert_almost_equal(a, b, # case for zero if abs(fa) < 1e-5: if not decimal_almost_equal(fa, fb, decimal): - assert False, ( - '(very low values) expected %.5f but got %.5f, with decimal %d' % (fb, fa, decimal) - ) + assert False, ('(very low values) expected %.5f but ' + 'got %.5f, with decimal %d' % (fb, fa, decimal)) else: if not decimal_almost_equal(1, fb / fa, decimal): - assert False, 'expected %.5f but got %.5f, with decimal %d' % (fb, fa, decimal) + assert False, ('expected %.5f but got %.5f, ' + 'with decimal %d' % (fb, fa, decimal)) return True raise AssertionError("{0} != {1}".format(a, b)) diff --git a/pandas/src/ujson/lib/ultrajson.h b/pandas/src/ujson/lib/ultrajson.h index f83f74a0fe0da..c37fe8c8e6c38 100644 --- a/pandas/src/ujson/lib/ultrajson.h +++ b/pandas/src/ujson/lib/ultrajson.h @@ -26,7 +26,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) -http://code.google.com/p/stringencoders/ +https://github.com/client9/stringencoders Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. Numeric decoder derived from from TCL library diff --git a/pandas/src/ujson/lib/ultrajsondec.c b/pandas/src/ujson/lib/ultrajsondec.c index 9a4d5972b101b..5496068832f2e 100644 --- a/pandas/src/ujson/lib/ultrajsondec.c +++ b/pandas/src/ujson/lib/ultrajsondec.c @@ -26,7 +26,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) -http://code.google.com/p/stringencoders/ +https://github.com/client9/stringencoders Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. 
Numeric decoder derived from from TCL library diff --git a/pandas/src/ujson/lib/ultrajsonenc.c b/pandas/src/ujson/lib/ultrajsonenc.c index 5e2a226ae8d63..2adf3cb707bdb 100644 --- a/pandas/src/ujson/lib/ultrajsonenc.c +++ b/pandas/src/ujson/lib/ultrajsonenc.c @@ -26,7 +26,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) -http://code.google.com/p/stringencoders/ +https://github.com/client9/stringencoders Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. Numeric decoder derived from from TCL library diff --git a/pandas/src/ujson/python/JSONtoObj.c b/pandas/src/ujson/python/JSONtoObj.c index 9c1b4febd9895..e4d02db4cb60a 100644 --- a/pandas/src/ujson/python/JSONtoObj.c +++ b/pandas/src/ujson/python/JSONtoObj.c @@ -26,7 +26,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) -http://code.google.com/p/stringencoders/ +https://github.com/client9/stringencoders Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. Numeric decoder derived from from TCL library diff --git a/pandas/src/ujson/python/objToJSON.c b/pandas/src/ujson/python/objToJSON.c index 46ae623ae88a7..75de63acbd7d6 100644 --- a/pandas/src/ujson/python/objToJSON.c +++ b/pandas/src/ujson/python/objToJSON.c @@ -26,7 +26,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) -http://code.google.com/p/stringencoders/ +https://github.com/client9/stringencoders Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. Numeric decoder derived from from TCL library @@ -450,7 +450,7 @@ static void *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, si static void *PandasDateTimeStructToJSON(pandas_datetimestruct *dts, JSONTypeContext *tc, void *outValue, size_t *_outLen) { - int base = ((PyObjectEncoder*) tc->encoder)->datetimeUnit; + PANDAS_DATETIMEUNIT base = ((PyObjectEncoder*) tc->encoder)->datetimeUnit; if (((PyObjectEncoder*) tc->encoder)->datetimeIso) { @@ -493,7 +493,7 @@ static void *NpyDateTimeScalarToJSON(JSOBJ _obj, JSONTypeContext *tc, void *outV PyDatetimeScalarObject *obj = (PyDatetimeScalarObject *) _obj; PRINTMARK(); - pandas_datetime_to_datetimestruct(obj->obval, obj->obmeta.base, &dts); + pandas_datetime_to_datetimestruct(obj->obval, (PANDAS_DATETIMEUNIT)obj->obmeta.base, &dts); return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen); } diff --git a/pandas/src/ujson/python/py_defines.h b/pandas/src/ujson/python/py_defines.h index 7a5083e131512..723eaed336f6b 100644 --- a/pandas/src/ujson/python/py_defines.h +++ b/pandas/src/ujson/python/py_defines.h @@ -26,7 +26,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) -http://code.google.com/p/stringencoders/ +https://github.com/client9/stringencoders Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. Numeric decoder derived from from TCL library diff --git a/pandas/src/ujson/python/ujson.c b/pandas/src/ujson/python/ujson.c index 2eb8a80c0325c..48ea92ed3bc8c 100644 --- a/pandas/src/ujson/python/ujson.c +++ b/pandas/src/ujson/python/ujson.c @@ -26,7 +26,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) -http://code.google.com/p/stringencoders/ +https://github.com/client9/stringencoders Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. Numeric decoder derived from from TCL library diff --git a/pandas/src/ujson/python/version.h b/pandas/src/ujson/python/version.h index 0ccfbfe74521c..2d4fd137edefe 100644 --- a/pandas/src/ujson/python/version.h +++ b/pandas/src/ujson/python/version.h @@ -26,7 +26,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) -http://code.google.com/p/stringencoders/ +https://github.com/client9/stringencoders Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. Numeric decoder derived from from TCL library diff --git a/pandas/src/util.pxd b/pandas/src/util.pxd index 84b331f1e8e6f..fcb5583a0a6e7 100644 --- a/pandas/src/util.pxd +++ b/pandas/src/util.pxd @@ -24,6 +24,20 @@ cdef extern from "numpy_helper.h": object sarr_from_data(cnp.dtype, int length, void* data) inline object unbox_if_zerodim(object arr) +ctypedef fused numeric: + cnp.int8_t + cnp.int16_t + cnp.int32_t + cnp.int64_t + + cnp.uint8_t + cnp.uint16_t + cnp.uint32_t + cnp.uint64_t + + cnp.float32_t + cnp.float64_t + cdef inline object get_value_at(ndarray arr, object loc): cdef: Py_ssize_t i, sz @@ -84,4 +98,4 @@ cdef inline bint _checknan(object val): return not cnp.PyArray_Check(val) and val != val cdef inline bint is_period_object(object val): - return getattr(val,'_typ','_typ') == 'period' + return getattr(val, '_typ', '_typ') == 'period' diff --git a/pandas/stats/fama_macbeth.py b/pandas/stats/fama_macbeth.py index caad53df2c7fe..f7d50e8e72a5c 100644 --- a/pandas/stats/fama_macbeth.py +++ b/pandas/stats/fama_macbeth.py @@ -37,7 +37,8 @@ def __init__(self, y, x, intercept=True, nw_lags=None, import warnings warnings.warn("The pandas.stats.fama_macbeth module is deprecated and will be " "removed in a future version. We refer to external packages " - "like statsmodels, see here: http://statsmodels.sourceforge.net/stable/index.html", + "like statsmodels, see here: " + "http://www.statsmodels.org/stable/index.html", FutureWarning, stacklevel=4) if dropped_dummies is None: diff --git a/pandas/stats/moments.py b/pandas/stats/moments.py index 46d30ab7fe313..95b209aee0b0c 100644 --- a/pandas/stats/moments.py +++ b/pandas/stats/moments.py @@ -6,7 +6,7 @@ import warnings import numpy as np -from pandas import lib +from pandas.types.common import is_scalar from pandas.core.api import DataFrame, Series from pandas.util.decorators import Substitution, Appender @@ -226,7 +226,7 @@ def ensure_compat(dispatch, name, arg, func_kw=None, *args, **kwargs): aargs += ',' def f(a, b): - if lib.isscalar(b): + if is_scalar(b): return "{a}={b}".format(a=a, b=b) return "{a}=<{b}>".format(a=a, b=type(b).__name__) aargs = ','.join([f(a, b) for a, b in kwds.items() if b is not None]) @@ -271,6 +271,9 @@ def rolling_count(arg, window, **kwargs): The `freq` keyword is used to conform time series data to a specified frequency by resampling the data. This is done with the default parameters of :meth:`~pandas.Series.resample` (i.e. using the `mean`). + + To learn more about the frequency strings, please see `this link + `__. 
""" return ensure_compat('rolling', 'count', arg, window=window, **kwargs) @@ -521,6 +524,9 @@ def rolling_quantile(arg, window, quantile, min_periods=None, freq=None, The `freq` keyword is used to conform time series data to a specified frequency by resampling the data. This is done with the default parameters of :meth:`~pandas.Series.resample` (i.e. using the `mean`). + + To learn more about the frequency strings, please see `this link + `__. """ return ensure_compat('rolling', 'quantile', @@ -570,6 +576,9 @@ def rolling_apply(arg, window, func, min_periods=None, freq=None, The `freq` keyword is used to conform time series data to a specified frequency by resampling the data. This is done with the default parameters of :meth:`~pandas.Series.resample` (i.e. using the `mean`). + + To learn more about the frequency strings, please see `this link + `__. """ return ensure_compat('rolling', 'apply', @@ -642,6 +651,9 @@ def rolling_window(arg, window=None, win_type=None, min_periods=None, The `freq` keyword is used to conform time series data to a specified frequency by resampling the data. This is done with the default parameters of :meth:`~pandas.Series.resample` (i.e. using the `mean`). + + To learn more about the frequency strings, please see `this link + `__. """ func = 'mean' if mean else 'sum' return ensure_compat('rolling', @@ -707,6 +719,9 @@ def expanding_count(arg, freq=None): The `freq` keyword is used to conform time series data to a specified frequency by resampling the data. This is done with the default parameters of :meth:`~pandas.Series.resample` (i.e. using the `mean`). + + To learn more about the frequency strings, please see `this link + `__. """ return ensure_compat('expanding', 'count', arg, freq=freq) @@ -735,6 +750,9 @@ def expanding_quantile(arg, quantile, min_periods=1, freq=None): The `freq` keyword is used to conform time series data to a specified frequency by resampling the data. This is done with the default parameters of :meth:`~pandas.Series.resample` (i.e. using the `mean`). + + To learn more about the frequency strings, please see `this link + `__. """ return ensure_compat('expanding', 'quantile', @@ -818,6 +836,9 @@ def expanding_apply(arg, func, min_periods=1, freq=None, The `freq` keyword is used to conform time series data to a specified frequency by resampling the data. This is done with the default parameters of :meth:`~pandas.Series.resample` (i.e. using the `mean`). + + To learn more about the frequency strings, please see `this link + `__. """ return ensure_compat('expanding', 'apply', diff --git a/pandas/stats/ols.py b/pandas/stats/ols.py index e2375ea180ed2..b533d255bd196 100644 --- a/pandas/stats/ols.py +++ b/pandas/stats/ols.py @@ -13,7 +13,7 @@ from pandas.core.api import DataFrame, Series, isnull from pandas.core.base import StringMixin -from pandas.core.common import _ensure_float64 +from pandas.types.common import _ensure_float64 from pandas.core.index import MultiIndex from pandas.core.panel import Panel from pandas.util.decorators import cache_readonly @@ -51,7 +51,8 @@ def __init__(self, y, x, intercept=True, weights=None, nw_lags=None, import warnings warnings.warn("The pandas.stats.ols module is deprecated and will be " "removed in a future version. 
We refer to external packages " - "like statsmodels, see some examples here: http://statsmodels.sourceforge.net/stable/regression.html", + "like statsmodels, see some examples here: " + "http://www.statsmodels.org/stable/regression.html", FutureWarning, stacklevel=4) try: diff --git a/pandas/stats/plm.py b/pandas/stats/plm.py index dca1977fb19bd..099c45d5ec60b 100644 --- a/pandas/stats/plm.py +++ b/pandas/stats/plm.py @@ -18,7 +18,6 @@ from pandas.core.frame import DataFrame from pandas.core.reshape import get_dummies from pandas.core.series import Series -from pandas.core.sparse import SparsePanel from pandas.stats.ols import OLS, MovingOLS import pandas.stats.common as com import pandas.stats.math as math @@ -39,7 +38,8 @@ def __init__(self, y, x, weights=None, intercept=True, nw_lags=None, import warnings warnings.warn("The pandas.stats.plm module is deprecated and will be " "removed in a future version. We refer to external packages " - "like statsmodels, see some examples here: http://statsmodels.sourceforge.net/stable/mixed_linear.html", + "like statsmodels, see some examples here: " + "http://www.statsmodels.org/stable/mixed_linear.html", FutureWarning, stacklevel=4) self._x_orig = x self._y_orig = y @@ -136,8 +136,7 @@ def _filter_data(self): if isinstance(data, Panel): data = data.copy() - if not isinstance(data, SparsePanel): - data, cat_mapping = self._convert_x(data) + data, cat_mapping = self._convert_x(data) if not isinstance(data, Panel): data = Panel.from_dict(data, intersect=True) @@ -743,7 +742,8 @@ def __init__(self, y, x, window_type='full_sample', window=None, import warnings warnings.warn("The pandas.stats.plm module is deprecated and will be " "removed in a future version. We refer to external packages " - "like statsmodels, see some examples here: http://statsmodels.sourceforge.net/stable/mixed_linear.html", + "like statsmodels, see some examples here: " + "http://www.statsmodels.org/stable/mixed_linear.html", FutureWarning, stacklevel=4) for attr in self.ATTRIBUTES: diff --git a/pandas/stats/tests/test_ols.py b/pandas/stats/tests/test_ols.py index bac824f0b4840..6f688649affb0 100644 --- a/pandas/stats/tests/test_ols.py +++ b/pandas/stats/tests/test_ols.py @@ -16,7 +16,7 @@ from pandas import date_range, bdate_range from pandas.core.panel import Panel -from pandas import DataFrame, Index, Series, notnull, datetools +from pandas import DataFrame, Index, Series, notnull, offsets from pandas.stats.api import ols from pandas.stats.ols import _filter_data from pandas.stats.plm import NonPooledPanelOLS, PanelOLS @@ -24,7 +24,7 @@ assert_frame_equal, assertRaisesRegexp, slow) import pandas.util.testing as tm import pandas.compat as compat -from .common import BaseTest +from pandas.stats.tests.common import BaseTest _have_statsmodels = True try: @@ -645,6 +645,7 @@ def testWithXEffects(self): exp_x = DataFrame([[0., 0., 14., 1.], [0, 1, 17, 1], [1, 0, 48, 1]], columns=['x1_30', 'x1_9', 'x2', 'intercept'], index=res.index, dtype=float) + exp_x[['x1_30', 'x1_9']] = exp_x[['x1_30', 'x1_9']].astype(np.uint8) assert_frame_equal(res, exp_x.reindex(columns=res.columns)) def testWithXEffectsAndDroppedDummies(self): @@ -659,6 +660,7 @@ def testWithXEffectsAndDroppedDummies(self): exp_x = DataFrame([[1., 0., 14., 1.], [0, 1, 17, 1], [0, 0, 48, 1]], columns=['x1_6', 'x1_9', 'x2', 'intercept'], index=res.index, dtype=float) + exp_x[['x1_6', 'x1_9']] = exp_x[['x1_6', 'x1_9']].astype(np.uint8) assert_frame_equal(res, exp_x.reindex(columns=res.columns)) @@ -896,22 +898,22 @@ class 
TestOLSFilter(tm.TestCase): def setUp(self): date_index = date_range(datetime(2009, 12, 11), periods=3, - freq=datetools.bday) + freq=offsets.BDay()) ts = Series([3, 1, 4], index=date_index) self.TS1 = ts date_index = date_range(datetime(2009, 12, 11), periods=5, - freq=datetools.bday) + freq=offsets.BDay()) ts = Series([1, 5, 9, 2, 6], index=date_index) self.TS2 = ts date_index = date_range(datetime(2009, 12, 11), periods=3, - freq=datetools.bday) + freq=offsets.BDay()) ts = Series([5, np.nan, 3], index=date_index) self.TS3 = ts date_index = date_range(datetime(2009, 12, 11), periods=5, - freq=datetools.bday) + freq=offsets.BDay()) ts = Series([np.nan, 5, 8, 9, 7], index=date_index) self.TS4 = ts diff --git a/pandas/stats/var.py b/pandas/stats/var.py index cc78ca2886fb3..db4028d60f5c8 100644 --- a/pandas/stats/var.py +++ b/pandas/stats/var.py @@ -31,7 +31,8 @@ def __init__(self, data, p=1, intercept=True): import warnings warnings.warn("The pandas.stats.var module is deprecated and will be " "removed in a future version. We refer to external packages " - "like statsmodels, see some examples here: http://statsmodels.sourceforge.net/stable/vector_ar.html#var", + "like statsmodels, see some examples here: " + "http://www.statsmodels.org/stable/vector_ar.html#var", FutureWarning, stacklevel=4) try: diff --git a/pandas/tests/formats/test_format.py b/pandas/tests/formats/test_format.py index e67fe2cddde77..7e55c04fec7cc 100644 --- a/pandas/tests/formats/test_format.py +++ b/pandas/tests/formats/test_format.py @@ -456,6 +456,28 @@ def test_to_string_with_formatters(self): '2 0x3 [ 3.0] -False-')) self.assertEqual(result, result2) + def test_to_string_with_datetime64_monthformatter(self): + months = [datetime(2016, 1, 1), datetime(2016, 2, 2)] + x = DataFrame({'months': months}) + + def format_func(x): + return x.strftime('%Y-%m') + result = x.to_string(formatters={'months': format_func}) + expected = 'months\n0 2016-01\n1 2016-02' + self.assertEqual(result.strip(), expected) + + def test_to_string_with_datetime64_hourformatter(self): + + x = DataFrame({'hod': pd.to_datetime(['10:10:10.100', '12:12:12.120'], + format='%H:%M:%S.%f')}) + + def format_func(x): + return x.strftime('%H:%M') + + result = x.to_string(formatters={'hod': format_func}) + expected = 'hod\n0 10:10\n1 12:12' + self.assertEqual(result.strip(), expected) + def test_to_string_with_formatters_unicode(self): df = DataFrame({u('c/\u03c3'): [1, 2, 3]}) result = df.to_string(formatters={u('c/\u03c3'): lambda x: '%s' % x}) @@ -1233,6 +1255,63 @@ def test_to_html_index_formatter(self): self.assertEqual(result, expected) + def test_to_html_datetime64_monthformatter(self): + months = [datetime(2016, 1, 1), datetime(2016, 2, 2)] + x = DataFrame({'months': months}) + + def format_func(x): + return x.strftime('%Y-%m') + result = x.to_html(formatters={'months': format_func}) + expected = """\ + + + + + + + + + + + + + + + + + +
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>months</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>2016-01</td>
    </tr>
    <tr>
      <th>1</th>
      <td>2016-02</td>
    </tr>
  </tbody>
</table>
    """ + self.assertEqual(result, expected) + + def test_to_html_datetime64_hourformatter(self): + + x = DataFrame({'hod': pd.to_datetime(['10:10:10.100', '12:12:12.120'], + format='%H:%M:%S.%f')}) + + def format_func(x): + return x.strftime('%H:%M') + result = x.to_html(formatters={'hod': format_func}) + expected = """\ + + + + + + + + + + + + + + + + + +
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>hod</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>10:10</td>
    </tr>
    <tr>
      <th>1</th>
      <td>12:12</td>
    </tr>
  </tbody>
</table>
    """ + self.assertEqual(result, expected) + def test_to_html_regression_GH6098(self): df = DataFrame({u('clé1'): [u('a'), u('a'), u('b'), u('b'), u('a')], u('clé2'): [u('1er'), u('2ème'), u('1er'), u('2ème'), @@ -1571,6 +1650,23 @@ def test_to_html_truncate_multi_index_sparse_off(self): expected = expected.decode('utf-8') self.assertEqual(result, expected) + def test_to_html_border(self): + df = DataFrame({'A': [1, 2]}) + result = df.to_html() + assert 'border="1"' in result + + def test_to_html_border_option(self): + df = DataFrame({'A': [1, 2]}) + with pd.option_context('html.border', 0): + result = df.to_html() + self.assertTrue('border="0"' in result) + self.assertTrue('border="0"' in df._repr_html_()) + + def test_to_html_border_zero(self): + df = DataFrame({'A': [1, 2]}) + result = df.to_html(border=0) + self.assertTrue('border="0"' in result) + def test_nonunicode_nonascii_alignment(self): df = DataFrame([["aa\xc3\xa4\xc3\xa4", 1], ["bbbb", 2]]) rep_str = df.to_string() @@ -1589,7 +1685,7 @@ def test_string_repr_encoding(self): def test_repr_corner(self): # representing infs poses no problems - df = DataFrame({'foo': np.inf * np.empty(10)}) + df = DataFrame({'foo': [-np.inf, np.inf]}) repr(df) def test_frame_info_encoding(self): @@ -1885,6 +1981,14 @@ def test_to_string_no_index(self): self.assertEqual(df_s, expected) + def test_to_string_line_width_no_index(self): + df = DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]}) + + df_s = df.to_string(line_width=1, index=False) + expected = "x \\\n1 \n2 \n3 \n\ny \n4 \n5 \n6" + + self.assertEqual(df_s, expected) + def test_to_string_float_formatting(self): self.reset_display_options() fmt.set_option('display.precision', 5, 'display.column_space', 12, @@ -2775,6 +2879,33 @@ def test_to_latex_format(self): self.assertEqual(withindex_result, withindex_expected) + def test_to_latex_with_formatters(self): + df = DataFrame({'int': [1, 2, 3], + 'float': [1.0, 2.0, 3.0], + 'object': [(1, 2), True, False], + 'datetime64': [datetime(2016, 1, 1), + datetime(2016, 2, 5), + datetime(2016, 3, 3)]}) + + formatters = {'int': lambda x: '0x%x' % x, + 'float': lambda x: '[% 4.1f]' % x, + 'object': lambda x: '-%s-' % str(x), + 'datetime64': lambda x: x.strftime('%Y-%m'), + '__index__': lambda x: 'index: %s' % x} + result = df.to_latex(formatters=dict(formatters)) + + expected = r"""\begin{tabular}{llrrl} +\toprule +{} & datetime64 & float & int & object \\ +\midrule +index: 0 & 2016-01 & [ 1.0] & 0x1 & -(1, 2)- \\ +index: 1 & 2016-02 & [ 2.0] & 0x2 & -True- \\ +index: 2 & 2016-03 & [ 3.0] & 0x3 & -False- \\ +\bottomrule +\end{tabular} +""" + self.assertEqual(result, expected) + def test_to_latex_multiindex(self): df = DataFrame({('x', 'y'): ['a']}) result = df.to_latex() @@ -3033,10 +3164,6 @@ def test_to_csv_quotechar(self): df.to_csv(path, quoting=1) # 1=QUOTE_ALL with open(path, 'r') as f: self.assertEqual(f.read(), expected) - with tm.ensure_clean('test.csv') as path: - df.to_csv(path, quoting=1, engine='python') - with open(path, 'r') as f: - self.assertEqual(f.read(), expected) expected = """\ $$,$col$ @@ -3048,17 +3175,10 @@ def test_to_csv_quotechar(self): df.to_csv(path, quoting=1, quotechar="$") with open(path, 'r') as f: self.assertEqual(f.read(), expected) - with tm.ensure_clean('test.csv') as path: - df.to_csv(path, quoting=1, quotechar="$", engine='python') - with open(path, 'r') as f: - self.assertEqual(f.read(), expected) with tm.ensure_clean('test.csv') as path: with tm.assertRaisesRegexp(TypeError, 'quotechar'): df.to_csv(path, quoting=1, 
quotechar=None) - with tm.ensure_clean('test.csv') as path: - with tm.assertRaisesRegexp(TypeError, 'quotechar'): - df.to_csv(path, quoting=1, quotechar=None, engine='python') def test_to_csv_doublequote(self): df = DataFrame({'col': ['a"a', '"bb"']}) @@ -3072,18 +3192,11 @@ def test_to_csv_doublequote(self): df.to_csv(path, quoting=1, doublequote=True) # QUOTE_ALL with open(path, 'r') as f: self.assertEqual(f.read(), expected) - with tm.ensure_clean('test.csv') as path: - df.to_csv(path, quoting=1, doublequote=True, engine='python') - with open(path, 'r') as f: - self.assertEqual(f.read(), expected) from _csv import Error with tm.ensure_clean('test.csv') as path: with tm.assertRaisesRegexp(Error, 'escapechar'): df.to_csv(path, doublequote=False) # no escapechar set - with tm.ensure_clean('test.csv') as path: - with tm.assertRaisesRegexp(Error, 'escapechar'): - df.to_csv(path, doublequote=False, engine='python') def test_to_csv_escapechar(self): df = DataFrame({'col': ['a"a', '"bb"']}) @@ -3097,11 +3210,6 @@ def test_to_csv_escapechar(self): df.to_csv(path, quoting=1, doublequote=False, escapechar='\\') with open(path, 'r') as f: self.assertEqual(f.read(), expected) - with tm.ensure_clean('test.csv') as path: - df.to_csv(path, quoting=1, doublequote=False, escapechar='\\', - engine='python') - with open(path, 'r') as f: - self.assertEqual(f.read(), expected) df = DataFrame({'col': ['a,a', ',bb,']}) expected = """\ @@ -3114,10 +3222,6 @@ def test_to_csv_escapechar(self): df.to_csv(path, quoting=3, escapechar='\\') # QUOTE_NONE with open(path, 'r') as f: self.assertEqual(f.read(), expected) - with tm.ensure_clean('test.csv') as path: - df.to_csv(path, quoting=3, escapechar='\\', engine='python') - with open(path, 'r') as f: - self.assertEqual(f.read(), expected) def test_csv_to_string(self): df = DataFrame({'col': [1, 2]}) @@ -3223,11 +3327,32 @@ def test_to_csv_date_format(self): self.assertEqual(df_sec_grouped.mean().to_csv(date_format='%Y-%m-%d'), expected_ymd_sec) - # deprecation GH11274 - def test_to_csv_engine_kw_deprecation(self): - with tm.assert_produces_warning(FutureWarning): - df = DataFrame({'col1': [1], 'col2': ['a'], 'col3': [10.1]}) - df.to_csv(engine='python') + def test_to_csv_multi_index(self): + # see gh-6618 + df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1],[2]])) + + exp = ",1\n,2\n0,1\n" + self.assertEqual(df.to_csv(), exp) + + exp = "1\n2\n1\n" + self.assertEqual(df.to_csv(index=False), exp) + + df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1],[2]]), + index=pd.MultiIndex.from_arrays([[1],[2]])) + + exp = ",,1\n,,2\n1,2,1\n" + self.assertEqual(df.to_csv(), exp) + + exp = "1\n2\n1\n" + self.assertEqual(df.to_csv(index=False), exp) + + df = DataFrame([1], columns=pd.MultiIndex.from_arrays([['foo'],['bar']])) + + exp = ",foo\n,bar\n0,1\n" + self.assertEqual(df.to_csv(), exp) + + exp = "foo\nbar\n1\n" + self.assertEqual(df.to_csv(index=False), exp) def test_period(self): # GH 12615 @@ -4161,6 +4286,28 @@ def test_dates_display(self): self.assertEqual(result[1].strip(), "NaT") self.assertEqual(result[4].strip(), "2013-01-01 09:00:00.000000004") + def test_datetime64formatter_yearmonth(self): + x = Series([datetime(2016, 1, 1), datetime(2016, 2, 2)]) + + def format_func(x): + return x.strftime('%Y-%m') + + formatter = fmt.Datetime64Formatter(x, formatter=format_func) + result = formatter.get_result() + self.assertEqual(result, ['2016-01', '2016-02']) + + def test_datetime64formatter_hoursecond(self): + + x = Series(pd.to_datetime(['10:10:10.100', 
'12:12:12.120'], + format='%H:%M:%S.%f')) + + def format_func(x): + return x.strftime('%H:%M') + + formatter = fmt.Datetime64Formatter(x, formatter=format_func) + result = formatter.get_result() + self.assertEqual(result, ['10:10', '12:12']) + class TestNaTFormatting(tm.TestCase): diff --git a/pandas/tests/formats/test_printing.py b/pandas/tests/formats/test_printing.py index 3bcceca1f50a7..880f7413544dd 100644 --- a/pandas/tests/formats/test_printing.py +++ b/pandas/tests/formats/test_printing.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- import nose from pandas import compat +from pandas import DataFrame import pandas.formats.printing as printing import pandas.formats.format as fmt import pandas.util.testing as tm @@ -8,6 +9,16 @@ _multiprocess_can_split_ = True +# Added due to issue #13032 as part of PR #13350 +def test_to_string_formatters_index_header(): + frame = DataFrame(data={0: 0, 1: 0}, index=[0]) + expected = ' 0 0' + + formatter = lambda x: '{:4d}'.format(x) + + string = frame.to_string(formatters=[formatter, formatter], index=False, + header=False) + assert(string == expected) def test_adjoin(): data = [['a', 'b', 'c'], ['dd', 'ee', 'ff'], ['ggg', 'hhh', 'iii']] diff --git a/pandas/tests/formats/test_style.py b/pandas/tests/formats/test_style.py index 5a79e3f6897f0..3083750e582fc 100644 --- a/pandas/tests/formats/test_style.py +++ b/pandas/tests/formats/test_style.py @@ -8,7 +8,7 @@ from pandas.util.testing import TestCase import pandas.util.testing as tm -# this is a mess. Getting failures on a python 2.7 build with +# Getting failures on a python 2.7 build with # whenever we try to import jinja, whether it's installed or not. # so we're explicitly skipping that one *before* we try to import # jinja. We still need to export the imports as globals, @@ -22,7 +22,7 @@ import jinja2 # noqa except ImportError: raise SkipTest("No Jinja2") -from pandas.formats.style import Styler # noqa +from pandas.formats.style import Styler, _get_level_lengths # noqa class TestStyler(TestCase): @@ -46,6 +46,17 @@ def h(x, foo='bar'): 'c': pd.Categorical(['a', 'b'])}) ] + def test_init_non_pandas(self): + with tm.assertRaises(TypeError): + Styler([1, 2, 3]) + + def test_init_series(self): + result = Styler(pd.Series([1, 2])) + self.assertEqual(result.data.ndim, 2) + + def test_repr_html_ok(self): + self.styler._repr_html_() + def test_update_ctx(self): self.styler._update_ctx(self.attrs) expected = {(0, 0): ['color: red'], @@ -102,7 +113,7 @@ def test_clear(self): def test_render(self): df = pd.DataFrame({"A": [0, 1]}) style = lambda x: pd.Series(["color: red", "color: blue"], name=x.name) - s = Styler(df, uuid='AB').apply(style).apply(style, axis=1) + s = Styler(df, uuid='AB').apply(style) s.render() # it worked? 
@@ -137,19 +148,29 @@ def test_empty_index_name_doesnt_display(self): df = pd.DataFrame({'A': [1, 2], 'B': [3, 4], 'C': [5, 6]}) result = df.style._translate() - expected = [[{'class': 'blank', 'type': 'th', 'value': ''}, + expected = [[{'class': 'blank level0', 'type': 'th', 'value': '', + 'is_visible': True, 'display_value': ''}, {'class': 'col_heading level0 col0', 'display_value': 'A', 'type': 'th', - 'value': 'A'}, + 'value': 'A', + 'is_visible': True, + 'attributes': ["colspan=1"], + }, {'class': 'col_heading level0 col1', 'display_value': 'B', 'type': 'th', - 'value': 'B'}, + 'value': 'B', + 'is_visible': True, + 'attributes': ["colspan=1"], + }, {'class': 'col_heading level0 col2', 'display_value': 'C', 'type': 'th', - 'value': 'C'}]] + 'value': 'C', + 'is_visible': True, + 'attributes': ["colspan=1"], + }]] self.assertEqual(result['head'], expected) @@ -158,12 +179,15 @@ def test_index_name(self): df = pd.DataFrame({'A': [1, 2], 'B': [3, 4], 'C': [5, 6]}) result = df.set_index('A').style._translate() - expected = [[{'class': 'blank', 'type': 'th', 'value': ''}, + expected = [[{'class': 'blank level0', 'type': 'th', 'value': '', + 'display_value': '', 'is_visible': True}, {'class': 'col_heading level0 col0', 'type': 'th', - 'value': 'B', 'display_value': 'B'}, + 'value': 'B', 'display_value': 'B', + 'is_visible': True, 'attributes': ['colspan=1']}, {'class': 'col_heading level0 col1', 'type': 'th', - 'value': 'C', 'display_value': 'C'}], - [{'class': 'col_heading level2 col0', 'type': 'th', + 'value': 'C', 'display_value': 'C', + 'is_visible': True, 'attributes': ['colspan=1']}], + [{'class': 'index_name level0', 'type': 'th', 'value': 'A'}, {'class': 'blank', 'type': 'th', 'value': ''}, {'class': 'blank', 'type': 'th', 'value': ''}]] @@ -175,15 +199,20 @@ def test_multiindex_name(self): df = pd.DataFrame({'A': [1, 2], 'B': [3, 4], 'C': [5, 6]}) result = df.set_index(['A', 'B']).style._translate() - expected = [[{'class': 'blank', 'type': 'th', 'value': ''}, - {'class': 'blank', 'type': 'th', 'value': ''}, - {'class': 'col_heading level0 col0', 'type': 'th', - 'value': 'C', 'display_value': 'C'}], - [{'class': 'col_heading level2 col0', 'type': 'th', - 'value': 'A'}, - {'class': 'col_heading level2 col1', 'type': 'th', - 'value': 'B'}, - {'class': 'blank', 'type': 'th', 'value': ''}]] + expected = [[ + {'class': 'blank', 'type': 'th', 'value': '', + 'display_value': '', 'is_visible': True}, + {'class': 'blank level0', 'type': 'th', 'value': '', + 'display_value': '', 'is_visible': True}, + {'class': 'col_heading level0 col0', 'type': 'th', + 'value': 'C', 'display_value': 'C', + 'is_visible': True, 'attributes': ['colspan=1'], + }], + [{'class': 'index_name level0', 'type': 'th', + 'value': 'A'}, + {'class': 'index_name level1', 'type': 'th', + 'value': 'B'}, + {'class': 'blank', 'type': 'th', 'value': ''}]] self.assertEqual(result['head'], expected) @@ -539,6 +568,167 @@ def test_display_dict(self): self.assertEqual(ctx['body'][0][1]['display_value'], '0.1') self.assertEqual(ctx['body'][0][3]['display_value'], 'AAA') + def test_bad_apply_shape(self): + df = pd.DataFrame([[1, 2], [3, 4]]) + with tm.assertRaises(ValueError): + df.style._apply(lambda x: 'x', subset=pd.IndexSlice[[0, 1], :]) + + with tm.assertRaises(ValueError): + df.style._apply(lambda x: [''], subset=pd.IndexSlice[[0, 1], :]) + + with tm.assertRaises(ValueError): + df.style._apply(lambda x: ['', '', '', '']) + + with tm.assertRaises(ValueError): + df.style._apply(lambda x: ['', '', ''], subset=1) + + with 
tm.assertRaises(ValueError): + df.style._apply(lambda x: ['', '', ''], axis=1) + + def test_apply_bad_return(self): + def f(x): + return '' + df = pd.DataFrame([[1, 2], [3, 4]]) + with tm.assertRaises(TypeError): + df.style._apply(f, axis=None) + + def test_apply_bad_labels(self): + def f(x): + return pd.DataFrame(index=[1, 2], columns=['a', 'b']) + df = pd.DataFrame([[1, 2], [3, 4]]) + with tm.assertRaises(ValueError): + df.style._apply(f, axis=None) + + def test_get_level_lengths(self): + index = pd.MultiIndex.from_product([['a', 'b'], [0, 1, 2]]) + expected = {(0, 0): 3, (0, 3): 3, (1, 0): 1, (1, 1): 1, (1, 2): 1, + (1, 3): 1, (1, 4): 1, (1, 5): 1} + result = _get_level_lengths(index) + tm.assert_dict_equal(result, expected) + + def test_get_level_lengths_un_sorted(self): + index = pd.MultiIndex.from_arrays([ + [1, 1, 2, 1], + ['a', 'b', 'b', 'd'] + ]) + expected = {(0, 0): 2, (0, 2): 1, (0, 3): 1, + (1, 0): 1, (1, 1): 1, (1, 2): 1, (1, 3): 1} + result = _get_level_lengths(index) + tm.assert_dict_equal(result, expected) + + def test_mi_sparse(self): + df = pd.DataFrame({'A': [1, 2]}, + index=pd.MultiIndex.from_arrays([['a', 'a'], + [0, 1]])) + result = df.style._translate() + body_0 = result['body'][0][0] + expected_0 = { + "value": "a", "display_value": "a", "is_visible": True, + "type": "th", "attributes": ["rowspan=2"], + "class": "row_heading level0 row0", + } + tm.assert_dict_equal(body_0, expected_0) + + body_1 = result['body'][0][1] + expected_1 = { + "value": 0, "display_value": 0, "is_visible": True, + "type": "th", "attributes": ["rowspan=1"], + "class": "row_heading level1 row0", + } + tm.assert_dict_equal(body_1, expected_1) + + body_10 = result['body'][1][0] + expected_10 = { + "value": 'a', "display_value": 'a', "is_visible": False, + "type": "th", "attributes": ["rowspan=1"], + "class": "row_heading level0 row1", + } + tm.assert_dict_equal(body_10, expected_10) + + head = result['head'][0] + expected = [ + {'type': 'th', 'class': 'blank', 'value': '', + 'is_visible': True, "display_value": ''}, + {'type': 'th', 'class': 'blank level0', 'value': '', + 'is_visible': True, 'display_value': ''}, + {'attributes': ['colspan=1'], 'class': 'col_heading level0 col0', + 'is_visible': True, 'type': 'th', 'value': 'A', + 'display_value': 'A'}] + self.assertEqual(head, expected) + + def test_mi_sparse_disabled(self): + with pd.option_context('display.multi_sparse', False): + df = pd.DataFrame({'A': [1, 2]}, + index=pd.MultiIndex.from_arrays([['a', 'a'], + [0, 1]])) + result = df.style._translate() + body = result['body'] + for row in body: + self.assertEqual(row[0]['attributes'], ['rowspan=1']) + + def test_mi_sparse_index_names(self): + df = pd.DataFrame({'A': [1, 2]}, index=pd.MultiIndex.from_arrays( + [['a', 'a'], [0, 1]], + names=['idx_level_0', 'idx_level_1']) + ) + result = df.style._translate() + head = result['head'][1] + expected = [{ + 'class': 'index_name level0', 'value': 'idx_level_0', + 'type': 'th'}, + {'class': 'index_name level1', 'value': 'idx_level_1', + 'type': 'th'}, + {'class': 'blank', 'value': '', 'type': 'th'}] + + self.assertEqual(head, expected) + + def test_mi_sparse_column_names(self): + df = pd.DataFrame( + np.arange(16).reshape(4, 4), + index=pd.MultiIndex.from_arrays( + [['a', 'a', 'b', 'a'], [0, 1, 1, 2]], + names=['idx_level_0', 'idx_level_1']), + columns=pd.MultiIndex.from_arrays( + [['C1', 'C1', 'C2', 'C2'], [1, 0, 1, 0]], + names=['col_0', 'col_1'] + ) + ) + result = df.style._translate() + head = result['head'][1] + expected = [ + {'class': 
'blank', 'value': '', 'display_value': '', + 'type': 'th', 'is_visible': True}, + {'class': 'index_name level1', 'value': 'col_1', + 'display_value': 'col_1', 'is_visible': True, 'type': 'th'}, + {'attributes': ['colspan=1'], + 'class': 'col_heading level1 col0', + 'display_value': 1, + 'is_visible': True, + 'type': 'th', + 'value': 1}, + {'attributes': ['colspan=1'], + 'class': 'col_heading level1 col1', + 'display_value': 0, + 'is_visible': True, + 'type': 'th', + 'value': 0}, + + {'attributes': ['colspan=1'], + 'class': 'col_heading level1 col2', + 'display_value': 1, + 'is_visible': True, + 'type': 'th', + 'value': 1}, + + {'attributes': ['colspan=1'], + 'class': 'col_heading level1 col3', + 'display_value': 0, + 'is_visible': True, + 'type': 'th', + 'value': 0}, + ] + self.assertEqual(head, expected) + @tm.mplskip class TestStylerMatplotlibDep(TestCase): diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 3b50dd2c1d49f..46f0fff7bb4b8 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -268,6 +268,7 @@ def test_set_index_cast_datetimeindex(self): lambda d: pd.Timestamp(d, tz=tz)) assert_frame_equal(df.reset_index(), expected) + def test_set_index_timezone(self): # GH 12358 # tz-aware Series should retain the tz i = pd.to_datetime(["2014-01-01 10:10:10"], @@ -277,6 +278,25 @@ def test_set_index_cast_datetimeindex(self): self.assertEqual(pd.DatetimeIndex(pd.Series(df.i))[0].hour, 11) self.assertEqual(df.set_index(df.i).index[0].hour, 11) + def test_set_index_dst(self): + di = pd.date_range('2006-10-29 00:00:00', periods=3, + req='H', tz='US/Pacific') + + df = pd.DataFrame(data={'a': [0, 1, 2], 'b': [3, 4, 5]}, + index=di).reset_index() + # single level + res = df.set_index('index') + exp = pd.DataFrame(data={'a': [0, 1, 2], 'b': [3, 4, 5]}, + index=pd.Index(di, name='index')) + tm.assert_frame_equal(res, exp) + + # GH 12920 + res = df.set_index(['index', 'a']) + exp_index = pd.MultiIndex.from_arrays([di, [0, 1, 2]], + names=['index', 'a']) + exp = pd.DataFrame({'b': [3, 4, 5]}, index=exp_index) + tm.assert_frame_equal(res, exp) + def test_set_index_multiindexcolumns(self): columns = MultiIndex.from_tuples([('foo', 1), ('foo', 2), ('bar', 1)]) df = DataFrame(np.random.randn(3, 3), columns=columns) @@ -644,3 +664,18 @@ def test_assign_columns(self): frame.columns = ['foo', 'bar', 'baz', 'quux', 'foo2'] assert_series_equal(self.frame['C'], frame['baz'], check_names=False) assert_series_equal(self.frame['hi'], frame['foo2'], check_names=False) + + def test_set_index_preserve_categorical_dtype(self): + # GH13743, GH13854 + df = DataFrame({'A': [1, 2, 1, 1, 2], + 'B': [10, 16, 22, 28, 34], + 'C1': pd.Categorical(list("abaab"), + categories=list("bac"), + ordered=False), + 'C2': pd.Categorical(list("abaab"), + categories=list("bac"), + ordered=True)}) + for cols in ['C1', 'C2', ['A', 'C1'], ['A', 'C2'], ['C1', 'C2']]: + result = df.set_index(cols).reset_index() + result = result.reindex(columns=df.columns) + tm.assert_frame_equal(result, df) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index b71235a8f6576..390d796ced006 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -249,6 +249,39 @@ def test_bool_describe_in_mixed_frame(self): index=['count', 'unique', 'top', 'freq']) tm.assert_frame_equal(result, expected) + def test_describe_bool_frame(self): + # GH 13891 + df = pd.DataFrame({ + 'bool_data_1': [False, False, 
True, True], + 'bool_data_2': [False, True, True, True] + }) + result = df.describe() + expected = DataFrame({'bool_data_1': [4, 2, True, 2], + 'bool_data_2': [4, 2, True, 3]}, + index=['count', 'unique', 'top', 'freq']) + tm.assert_frame_equal(result, expected) + + df = pd.DataFrame({ + 'bool_data': [False, False, True, True, False], + 'int_data': [0, 1, 2, 3, 4] + }) + result = df.describe() + expected = DataFrame({'int_data': [5, 2, df.int_data.std(), 0, 1, + 2, 3, 4]}, + index=['count', 'mean', 'std', 'min', '25%', + '50%', '75%', 'max']) + tm.assert_frame_equal(result, expected) + + df = pd.DataFrame({ + 'bool_data': [False, False, True, True], + 'str_data': ['a', 'b', 'c', 'a'] + }) + result = df.describe() + expected = DataFrame({'bool_data': [4, 2, True, 2], + 'str_data': [4, 3, 'a', 2]}, + index=['count', 'unique', 'top', 'freq']) + tm.assert_frame_equal(result, expected) + def test_describe_categorical_columns(self): # GH 11558 columns = pd.CategoricalIndex(['int1', 'int2', 'obj'], @@ -295,6 +328,43 @@ def test_describe_datetime_columns(self): self.assertEqual(result.columns.freq, 'MS') self.assertEqual(result.columns.tz, expected.columns.tz) + def test_describe_timedelta_values(self): + # GH 6145 + t1 = pd.timedelta_range('1 days', freq='D', periods=5) + t2 = pd.timedelta_range('1 hours', freq='H', periods=5) + df = pd.DataFrame({'t1': t1, 't2': t2}) + + expected = DataFrame({'t1': [5, pd.Timedelta('3 days'), + df.iloc[:, 0].std(), + pd.Timedelta('1 days'), + pd.Timedelta('2 days'), + pd.Timedelta('3 days'), + pd.Timedelta('4 days'), + pd.Timedelta('5 days')], + 't2': [5, pd.Timedelta('3 hours'), + df.iloc[:, 1].std(), + pd.Timedelta('1 hours'), + pd.Timedelta('2 hours'), + pd.Timedelta('3 hours'), + pd.Timedelta('4 hours'), + pd.Timedelta('5 hours')]}, + index=['count', 'mean', 'std', 'min', '25%', + '50%', '75%', 'max']) + + res = df.describe() + tm.assert_frame_equal(res, expected) + + exp_repr = (" t1 t2\n" + "count 5 5\n" + "mean 3 days 00:00:00 0 days 03:00:00\n" + "std 1 days 13:56:50.394919 0 days 01:34:52.099788\n" + "min 1 days 00:00:00 0 days 01:00:00\n" + "25% 2 days 00:00:00 0 days 02:00:00\n" + "50% 3 days 00:00:00 0 days 03:00:00\n" + "75% 4 days 00:00:00 0 days 04:00:00\n" + "max 5 days 00:00:00 0 days 05:00:00") + self.assertEqual(repr(res), exp_repr) + def test_reduce_mixed_frame(self): # GH 6806 df = DataFrame({ diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index 2b619b84a5994..5cadb4dba577f 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -10,7 +10,7 @@ from pandas import (notnull, DataFrame, Series, MultiIndex, date_range, Timestamp, compat) import pandas as pd -import pandas.core.common as com +from pandas.types.dtypes import CategoricalDtype from pandas.util.testing import (assert_series_equal, assert_frame_equal) import pandas.util.testing as tm @@ -22,18 +22,19 @@ class TestDataFrameApply(tm.TestCase, TestData): _multiprocess_can_split_ = True def test_apply(self): - # ufunc - applied = self.frame.apply(np.sqrt) - assert_series_equal(np.sqrt(self.frame['A']), applied['A']) + with np.errstate(all='ignore'): + # ufunc + applied = self.frame.apply(np.sqrt) + assert_series_equal(np.sqrt(self.frame['A']), applied['A']) - # aggregator - applied = self.frame.apply(np.mean) - self.assertEqual(applied['A'], np.mean(self.frame['A'])) + # aggregator + applied = self.frame.apply(np.mean) + self.assertEqual(applied['A'], np.mean(self.frame['A'])) - d = self.frame.index[0] - applied = 
self.frame.apply(np.mean, axis=1) - self.assertEqual(applied[d], np.mean(self.frame.xs(d))) - self.assertIs(applied.index, self.frame.index) # want this + d = self.frame.index[0] + applied = self.frame.apply(np.mean, axis=1) + self.assertEqual(applied[d], np.mean(self.frame.xs(d))) + self.assertIs(applied.index, self.frame.index) # want this # invalid axis df = DataFrame( @@ -45,8 +46,8 @@ def test_apply(self): 'c1': ['C', 'C', 'D', 'D']}) df = df.apply(lambda ts: ts.astype('category')) self.assertEqual(df.shape, (4, 2)) - self.assertTrue(isinstance(df['c0'].dtype, com.CategoricalDtype)) - self.assertTrue(isinstance(df['c1'].dtype, com.CategoricalDtype)) + self.assertTrue(isinstance(df['c0'].dtype, CategoricalDtype)) + self.assertTrue(isinstance(df['c1'].dtype, CategoricalDtype)) def test_apply_mixed_datetimelike(self): # mixed datetimelike @@ -187,10 +188,11 @@ def _checkit(axis=0, raw=False): _checkit(raw=True) _checkit(axis=0, raw=True) - _check(no_cols, lambda x: x) - _check(no_cols, lambda x: x.mean()) - _check(no_index, lambda x: x) - _check(no_index, lambda x: x.mean()) + with np.errstate(all='ignore'): + _check(no_cols, lambda x: x) + _check(no_cols, lambda x: x.mean()) + _check(no_index, lambda x: x) + _check(no_index, lambda x: x.mean()) result = no_cols.apply(lambda x: x.mean(), broadcast=True) tm.assertIsInstance(result, DataFrame) diff --git a/pandas/tests/frame/test_asof.py b/pandas/tests/frame/test_asof.py new file mode 100644 index 0000000000000..6c15c75cb5427 --- /dev/null +++ b/pandas/tests/frame/test_asof.py @@ -0,0 +1,72 @@ +# coding=utf-8 + +import nose + +import numpy as np +from pandas import DataFrame, date_range + +from pandas.util.testing import assert_frame_equal +import pandas.util.testing as tm + +from .common import TestData + + +class TestFrameAsof(TestData, tm.TestCase): + _multiprocess_can_split_ = True + + def setUp(self): + self.N = N = 50 + rng = date_range('1/1/1990', periods=N, freq='53s') + self.df = DataFrame({'A': np.arange(N), 'B': np.arange(N)}, + index=rng) + + def test_basic(self): + + df = self.df.copy() + df.ix[15:30, 'A'] = np.nan + dates = date_range('1/1/1990', periods=self.N * 3, + freq='25s') + + result = df.asof(dates) + self.assertTrue(result.notnull().all(1).all()) + lb = df.index[14] + ub = df.index[30] + + dates = list(dates) + result = df.asof(dates) + self.assertTrue(result.notnull().all(1).all()) + + mask = (result.index >= lb) & (result.index < ub) + rs = result[mask] + self.assertTrue((rs == 14).all(1).all()) + + def test_subset(self): + + N = 10 + rng = date_range('1/1/1990', periods=N, freq='53s') + df = DataFrame({'A': np.arange(N), 'B': np.arange(N)}, + index=rng) + df.ix[4:8, 'A'] = np.nan + dates = date_range('1/1/1990', periods=N * 3, + freq='25s') + + # with a subset of A should be the same + result = df.asof(dates, subset='A') + expected = df.asof(dates) + assert_frame_equal(result, expected) + + # same with A/B + result = df.asof(dates, subset=['A', 'B']) + expected = df.asof(dates) + assert_frame_equal(result, expected) + + # B gives self.df.asof + result = df.asof(dates, subset='B') + expected = df.resample('25s', closed='right').ffill().reindex(dates) + expected.iloc[20:] = 9 + + assert_frame_equal(result, expected) + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/frame/test_axis_select_reindex.py b/pandas/tests/frame/test_axis_select_reindex.py index 07fe28f13b7d0..9da1b31d259c5 100644 --- 
a/pandas/tests/frame/test_axis_select_reindex.py +++ b/pandas/tests/frame/test_axis_select_reindex.py @@ -661,8 +661,24 @@ def test_filter(self): assert_frame_equal(filtered, expected) # pass in None + with assertRaisesRegexp(TypeError, 'Must pass'): + self.frame.filter() with assertRaisesRegexp(TypeError, 'Must pass'): self.frame.filter(items=None) + with assertRaisesRegexp(TypeError, 'Must pass'): + self.frame.filter(axis=1) + + # test mutually exclusive arguments + with assertRaisesRegexp(TypeError, 'mutually exclusive'): + self.frame.filter(items=['one', 'three'], regex='e$', like='bbi') + with assertRaisesRegexp(TypeError, 'mutually exclusive'): + self.frame.filter(items=['one', 'three'], regex='e$', axis=1) + with assertRaisesRegexp(TypeError, 'mutually exclusive'): + self.frame.filter(items=['one', 'three'], regex='e$') + with assertRaisesRegexp(TypeError, 'mutually exclusive'): + self.frame.filter(items=['one', 'three'], like='bbi', axis=0) + with assertRaisesRegexp(TypeError, 'mutually exclusive'): + self.frame.filter(items=['one', 'three'], like='bbi') # objects filtered = self.mixed_frame.filter(like='foo') diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 0421cf2ba42d2..e51cc0f5a6ec7 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -104,15 +104,21 @@ def test_as_matrix_lcd(self): values = self.mixed_float.as_matrix(['C']) self.assertEqual(values.dtype, np.float16) + # GH 10364 + # B uint64 forces float because there are other signed int types values = self.mixed_int.as_matrix(['A', 'B', 'C', 'D']) - self.assertEqual(values.dtype, np.int64) + self.assertEqual(values.dtype, np.float64) values = self.mixed_int.as_matrix(['A', 'D']) self.assertEqual(values.dtype, np.int64) - # guess all ints are cast to uints.... 
+ # B uint64 forces float because there are other signed int types values = self.mixed_int.as_matrix(['A', 'B', 'C']) - self.assertEqual(values.dtype, np.int64) + self.assertEqual(values.dtype, np.float64) + + # as B and C are both unsigned, no forcing to float is needed + values = self.mixed_int.as_matrix(['B', 'C']) + self.assertEqual(values.dtype, np.uint64) values = self.mixed_int.as_matrix(['A', 'C']) self.assertEqual(values.dtype, np.int32) @@ -372,11 +378,13 @@ def test_consolidate_datetime64(self): ser_starting.index = ser_starting.values ser_starting = ser_starting.tz_localize('US/Eastern') ser_starting = ser_starting.tz_convert('UTC') + ser_starting.index.name = 'starting' ser_ending = df.ending ser_ending.index = ser_ending.values ser_ending = ser_ending.tz_localize('US/Eastern') ser_ending = ser_ending.tz_convert('UTC') + ser_ending.index.name = 'ending' df.starting = ser_starting.index df.ending = ser_ending.index diff --git a/pandas/tests/frame/test_combine_concat.py b/pandas/tests/frame/test_combine_concat.py index 7202915f13258..e5aaba26135e7 100644 --- a/pandas/tests/frame/test_combine_concat.py +++ b/pandas/tests/frame/test_combine_concat.py @@ -20,23 +20,11 @@ from pandas.tests.frame.common import TestData -class TestDataFrameCombineConcat(tm.TestCase, TestData): +class TestDataFrameConcatCommon(tm.TestCase, TestData): _multiprocess_can_split_ = True - def test_combine_first_mixed(self): - a = Series(['a', 'b'], index=lrange(2)) - b = Series(lrange(2), index=lrange(2)) - f = DataFrame({'A': a, 'B': b}) - - a = Series(['a', 'b'], index=lrange(5, 7)) - b = Series(lrange(2), index=lrange(5, 7)) - g = DataFrame({'A': a, 'B': b}) - - # TODO(wesm): no verification? - combined = f.combine_first(g) # noqa - - def test_combine_multiple_frames_dtypes(self): + def test_concat_multiple_frames_dtypes(self): # GH 2759 A = DataFrame(data=np.ones((10, 2)), columns=[ @@ -46,7 +34,7 @@ def test_combine_multiple_frames_dtypes(self): expected = Series(dict(float64=2, float32=2)) assert_series_equal(results, expected) - def test_combine_multiple_tzs(self): + def test_concat_multiple_tzs(self): # GH 12467 # combining datetime tz-aware and naive DataFrames ts1 = Timestamp('2015-01-01', tz=None) @@ -194,147 +182,6 @@ def test_append_dtypes(self): expected = DataFrame({'bar': Series([Timestamp('20130101'), 1])}) assert_frame_equal(result, expected) - def test_combine_first(self): - # disjoint - head, tail = self.frame[:5], self.frame[5:] - - combined = head.combine_first(tail) - reordered_frame = self.frame.reindex(combined.index) - assert_frame_equal(combined, reordered_frame) - self.assertTrue(tm.equalContents(combined.columns, self.frame.columns)) - assert_series_equal(combined['A'], reordered_frame['A']) - - # same index - fcopy = self.frame.copy() - fcopy['A'] = 1 - del fcopy['C'] - - fcopy2 = self.frame.copy() - fcopy2['B'] = 0 - del fcopy2['D'] - - combined = fcopy.combine_first(fcopy2) - - self.assertTrue((combined['A'] == 1).all()) - assert_series_equal(combined['B'], fcopy['B']) - assert_series_equal(combined['C'], fcopy2['C']) - assert_series_equal(combined['D'], fcopy['D']) - - # overlap - head, tail = reordered_frame[:10].copy(), reordered_frame - head['A'] = 1 - - combined = head.combine_first(tail) - self.assertTrue((combined['A'][:10] == 1).all()) - - # reverse overlap - tail['A'][:10] = 0 - combined = tail.combine_first(head) - self.assertTrue((combined['A'][:10] == 0).all()) - - # no overlap - f = self.frame[:10] - g = self.frame[10:] - combined = f.combine_first(g) - 
assert_series_equal(combined['A'].reindex(f.index), f['A']) - assert_series_equal(combined['A'].reindex(g.index), g['A']) - - # corner cases - comb = self.frame.combine_first(self.empty) - assert_frame_equal(comb, self.frame) - - comb = self.empty.combine_first(self.frame) - assert_frame_equal(comb, self.frame) - - comb = self.frame.combine_first(DataFrame(index=["faz", "boo"])) - self.assertTrue("faz" in comb.index) - - # #2525 - df = DataFrame({'a': [1]}, index=[datetime(2012, 1, 1)]) - df2 = DataFrame({}, columns=['b']) - result = df.combine_first(df2) - self.assertTrue('b' in result) - - def test_combine_first_mixed_bug(self): - idx = Index(['a', 'b', 'c', 'e']) - ser1 = Series([5.0, -9.0, 4.0, 100.], index=idx) - ser2 = Series(['a', 'b', 'c', 'e'], index=idx) - ser3 = Series([12, 4, 5, 97], index=idx) - - frame1 = DataFrame({"col0": ser1, - "col2": ser2, - "col3": ser3}) - - idx = Index(['a', 'b', 'c', 'f']) - ser1 = Series([5.0, -9.0, 4.0, 100.], index=idx) - ser2 = Series(['a', 'b', 'c', 'f'], index=idx) - ser3 = Series([12, 4, 5, 97], index=idx) - - frame2 = DataFrame({"col1": ser1, - "col2": ser2, - "col5": ser3}) - - combined = frame1.combine_first(frame2) - self.assertEqual(len(combined.columns), 5) - - # gh 3016 (same as in update) - df = DataFrame([[1., 2., False, True], [4., 5., True, False]], - columns=['A', 'B', 'bool1', 'bool2']) - - other = DataFrame([[45, 45]], index=[0], columns=['A', 'B']) - result = df.combine_first(other) - assert_frame_equal(result, df) - - df.ix[0, 'A'] = np.nan - result = df.combine_first(other) - df.ix[0, 'A'] = 45 - assert_frame_equal(result, df) - - # doc example - df1 = DataFrame({'A': [1., np.nan, 3., 5., np.nan], - 'B': [np.nan, 2., 3., np.nan, 6.]}) - - df2 = DataFrame({'A': [5., 2., 4., np.nan, 3., 7.], - 'B': [np.nan, np.nan, 3., 4., 6., 8.]}) - - result = df1.combine_first(df2) - expected = DataFrame( - {'A': [1, 2, 3, 5, 3, 7.], 'B': [np.nan, 2, 3, 4, 6, 8]}) - assert_frame_equal(result, expected) - - # GH3552, return object dtype with bools - df1 = DataFrame( - [[np.nan, 3., True], [-4.6, np.nan, True], [np.nan, 7., False]]) - df2 = DataFrame( - [[-42.6, np.nan, True], [-5., 1.6, False]], index=[1, 2]) - - result = df1.combine_first(df2)[2] - expected = Series([True, True, False], name=2) - assert_series_equal(result, expected) - - # GH 3593, converting datetime64[ns] incorrecly - df0 = DataFrame({"a": [datetime(2000, 1, 1), - datetime(2000, 1, 2), - datetime(2000, 1, 3)]}) - df1 = DataFrame({"a": [None, None, None]}) - df2 = df1.combine_first(df0) - assert_frame_equal(df2, df0) - - df2 = df0.combine_first(df1) - assert_frame_equal(df2, df0) - - df0 = DataFrame({"a": [datetime(2000, 1, 1), - datetime(2000, 1, 2), - datetime(2000, 1, 3)]}) - df1 = DataFrame({"a": [datetime(2000, 1, 2), None, None]}) - df2 = df1.combine_first(df0) - result = df0.copy() - result.iloc[0, :] = df1.iloc[0, :] - assert_frame_equal(df2, result) - - df2 = df0.combine_first(df1) - assert_frame_equal(df2, df0) - def test_update(self): df = DataFrame([[1.5, nan, 3.], [1.5, nan, 3.], @@ -476,3 +323,305 @@ def test_join_multiindex_leftright(self): assert_frame_equal(df1.join(df2, how='right'), exp) assert_frame_equal(df2.join(df1, how='left'), exp[['value2', 'value1']]) + + +class TestDataFrameCombineFirst(tm.TestCase, TestData): + + _multiprocess_can_split_ = True + + def test_combine_first_mixed(self): + a = Series(['a', 'b'], index=lrange(2)) + b = Series(lrange(2), index=lrange(2)) + f = DataFrame({'A': a, 'B': b}) + + a = Series(['a', 'b'], index=lrange(5, 7)) 
+ b = Series(lrange(2), index=lrange(5, 7)) + g = DataFrame({'A': a, 'B': b}) + + exp = pd.DataFrame({'A': list('abab'), 'B': [0., 1., 0., 1.]}, + index=[0, 1, 5, 6]) + combined = f.combine_first(g) + tm.assert_frame_equal(combined, exp) + + def test_combine_first(self): + # disjoint + head, tail = self.frame[:5], self.frame[5:] + + combined = head.combine_first(tail) + reordered_frame = self.frame.reindex(combined.index) + assert_frame_equal(combined, reordered_frame) + self.assertTrue(tm.equalContents(combined.columns, self.frame.columns)) + assert_series_equal(combined['A'], reordered_frame['A']) + + # same index + fcopy = self.frame.copy() + fcopy['A'] = 1 + del fcopy['C'] + + fcopy2 = self.frame.copy() + fcopy2['B'] = 0 + del fcopy2['D'] + + combined = fcopy.combine_first(fcopy2) + + self.assertTrue((combined['A'] == 1).all()) + assert_series_equal(combined['B'], fcopy['B']) + assert_series_equal(combined['C'], fcopy2['C']) + assert_series_equal(combined['D'], fcopy['D']) + + # overlap + head, tail = reordered_frame[:10].copy(), reordered_frame + head['A'] = 1 + + combined = head.combine_first(tail) + self.assertTrue((combined['A'][:10] == 1).all()) + + # reverse overlap + tail['A'][:10] = 0 + combined = tail.combine_first(head) + self.assertTrue((combined['A'][:10] == 0).all()) + + # no overlap + f = self.frame[:10] + g = self.frame[10:] + combined = f.combine_first(g) + assert_series_equal(combined['A'].reindex(f.index), f['A']) + assert_series_equal(combined['A'].reindex(g.index), g['A']) + + # corner cases + comb = self.frame.combine_first(self.empty) + assert_frame_equal(comb, self.frame) + + comb = self.empty.combine_first(self.frame) + assert_frame_equal(comb, self.frame) + + comb = self.frame.combine_first(DataFrame(index=["faz", "boo"])) + self.assertTrue("faz" in comb.index) + + # #2525 + df = DataFrame({'a': [1]}, index=[datetime(2012, 1, 1)]) + df2 = DataFrame({}, columns=['b']) + result = df.combine_first(df2) + self.assertTrue('b' in result) + + def test_combine_first_mixed_bug(self): + idx = Index(['a', 'b', 'c', 'e']) + ser1 = Series([5.0, -9.0, 4.0, 100.], index=idx) + ser2 = Series(['a', 'b', 'c', 'e'], index=idx) + ser3 = Series([12, 4, 5, 97], index=idx) + + frame1 = DataFrame({"col0": ser1, + "col2": ser2, + "col3": ser3}) + + idx = Index(['a', 'b', 'c', 'f']) + ser1 = Series([5.0, -9.0, 4.0, 100.], index=idx) + ser2 = Series(['a', 'b', 'c', 'f'], index=idx) + ser3 = Series([12, 4, 5, 97], index=idx) + + frame2 = DataFrame({"col1": ser1, + "col2": ser2, + "col5": ser3}) + + combined = frame1.combine_first(frame2) + self.assertEqual(len(combined.columns), 5) + + # gh 3016 (same as in update) + df = DataFrame([[1., 2., False, True], [4., 5., True, False]], + columns=['A', 'B', 'bool1', 'bool2']) + + other = DataFrame([[45, 45]], index=[0], columns=['A', 'B']) + result = df.combine_first(other) + assert_frame_equal(result, df) + + df.ix[0, 'A'] = np.nan + result = df.combine_first(other) + df.ix[0, 'A'] = 45 + assert_frame_equal(result, df) + + # doc example + df1 = DataFrame({'A': [1., np.nan, 3., 5., np.nan], + 'B': [np.nan, 2., 3., np.nan, 6.]}) + + df2 = DataFrame({'A': [5., 2., 4., np.nan, 3., 7.], + 'B': [np.nan, np.nan, 3., 4., 6., 8.]}) + + result = df1.combine_first(df2) + expected = DataFrame( + {'A': [1, 2, 3, 5, 3, 7.], 'B': [np.nan, 2, 3, 4, 6, 8]}) + assert_frame_equal(result, expected) + + # GH3552, return object dtype with bools + df1 = DataFrame( + [[np.nan, 3., True], [-4.6, np.nan, True], [np.nan, 7., False]]) + df2 = DataFrame( + [[-42.6, np.nan, 
True], [-5., 1.6, False]], index=[1, 2]) + + result = df1.combine_first(df2)[2] + expected = Series([True, True, False], name=2) + assert_series_equal(result, expected) + + # GH 3593, converting datetime64[ns] incorrecly + df0 = DataFrame({"a": [datetime(2000, 1, 1), + datetime(2000, 1, 2), + datetime(2000, 1, 3)]}) + df1 = DataFrame({"a": [None, None, None]}) + df2 = df1.combine_first(df0) + assert_frame_equal(df2, df0) + + df2 = df0.combine_first(df1) + assert_frame_equal(df2, df0) + + df0 = DataFrame({"a": [datetime(2000, 1, 1), + datetime(2000, 1, 2), + datetime(2000, 1, 3)]}) + df1 = DataFrame({"a": [datetime(2000, 1, 2), None, None]}) + df2 = df1.combine_first(df0) + result = df0.copy() + result.iloc[0, :] = df1.iloc[0, :] + assert_frame_equal(df2, result) + + df2 = df0.combine_first(df1) + assert_frame_equal(df2, df0) + + def test_combine_first_align_nan(self): + # GH 7509 (not fixed) + dfa = pd.DataFrame([[pd.Timestamp('2011-01-01'), 2]], + columns=['a', 'b']) + dfb = pd.DataFrame([[4], [5]], columns=['b']) + self.assertEqual(dfa['a'].dtype, 'datetime64[ns]') + self.assertEqual(dfa['b'].dtype, 'int64') + + res = dfa.combine_first(dfb) + exp = pd.DataFrame({'a': [pd.Timestamp('2011-01-01'), pd.NaT], + 'b': [2., 5.]}, columns=['a', 'b']) + tm.assert_frame_equal(res, exp) + self.assertEqual(res['a'].dtype, 'datetime64[ns]') + # ToDo: this must be int64 + self.assertEqual(res['b'].dtype, 'float64') + + res = dfa.iloc[:0].combine_first(dfb) + exp = pd.DataFrame({'a': [np.nan, np.nan], + 'b': [4, 5]}, columns=['a', 'b']) + tm.assert_frame_equal(res, exp) + # ToDo: this must be datetime64 + self.assertEqual(res['a'].dtype, 'float64') + # ToDo: this must be int64 + self.assertEqual(res['b'].dtype, 'int64') + + def test_combine_first_timezone(self): + # GH 7630 + data1 = pd.to_datetime('20100101 01:01').tz_localize('UTC') + df1 = pd.DataFrame(columns=['UTCdatetime', 'abc'], + data=data1, + index=pd.date_range('20140627', periods=1)) + data2 = pd.to_datetime('20121212 12:12').tz_localize('UTC') + df2 = pd.DataFrame(columns=['UTCdatetime', 'xyz'], + data=data2, + index=pd.date_range('20140628', periods=1)) + res = df2[['UTCdatetime']].combine_first(df1) + exp = pd.DataFrame({'UTCdatetime': [pd.Timestamp('2010-01-01 01:01', + tz='UTC'), + pd.Timestamp('2012-12-12 12:12', + tz='UTC')], + 'abc': [pd.Timestamp('2010-01-01 01:01:00', + tz='UTC'), pd.NaT]}, + columns=['UTCdatetime', 'abc'], + index=pd.date_range('20140627', periods=2, + freq='D')) + tm.assert_frame_equal(res, exp) + self.assertEqual(res['UTCdatetime'].dtype, 'datetime64[ns, UTC]') + self.assertEqual(res['abc'].dtype, 'datetime64[ns, UTC]') + + # GH 10567 + dts1 = pd.date_range('2015-01-01', '2015-01-05', tz='UTC') + df1 = pd.DataFrame({'DATE': dts1}) + dts2 = pd.date_range('2015-01-03', '2015-01-05', tz='UTC') + df2 = pd.DataFrame({'DATE': dts2}) + + res = df1.combine_first(df2) + tm.assert_frame_equal(res, df1) + self.assertEqual(res['DATE'].dtype, 'datetime64[ns, UTC]') + + dts1 = pd.DatetimeIndex(['2011-01-01', 'NaT', '2011-01-03', + '2011-01-04'], tz='US/Eastern') + df1 = pd.DataFrame({'DATE': dts1}, index=[1, 3, 5, 7]) + dts2 = pd.DatetimeIndex(['2012-01-01', '2012-01-02', + '2012-01-03'], tz='US/Eastern') + df2 = pd.DataFrame({'DATE': dts2}, index=[2, 4, 5]) + + res = df1.combine_first(df2) + exp_dts = pd.DatetimeIndex(['2011-01-01', '2012-01-01', 'NaT', + '2012-01-02', '2011-01-03', '2011-01-04'], + tz='US/Eastern') + exp = pd.DataFrame({'DATE': exp_dts}, index=[1, 2, 3, 4, 5, 7]) + tm.assert_frame_equal(res, exp) + + # 
different tz + dts1 = pd.date_range('2015-01-01', '2015-01-05', tz='US/Eastern') + df1 = pd.DataFrame({'DATE': dts1}) + dts2 = pd.date_range('2015-01-03', '2015-01-05') + df2 = pd.DataFrame({'DATE': dts2}) + + # if df1 doesn't have NaN, keep its dtype + res = df1.combine_first(df2) + tm.assert_frame_equal(res, df1) + self.assertEqual(res['DATE'].dtype, 'datetime64[ns, US/Eastern]') + + dts1 = pd.date_range('2015-01-01', '2015-01-02', tz='US/Eastern') + df1 = pd.DataFrame({'DATE': dts1}) + dts2 = pd.date_range('2015-01-01', '2015-01-03') + df2 = pd.DataFrame({'DATE': dts2}) + + res = df1.combine_first(df2) + exp_dts = [pd.Timestamp('2015-01-01', tz='US/Eastern'), + pd.Timestamp('2015-01-02', tz='US/Eastern'), + pd.Timestamp('2015-01-03')] + exp = pd.DataFrame({'DATE': exp_dts}) + tm.assert_frame_equal(res, exp) + self.assertEqual(res['DATE'].dtype, 'object') + + def test_combine_first_timedelta(self): + data1 = pd.TimedeltaIndex(['1 day', 'NaT', '3 day', '4day']) + df1 = pd.DataFrame({'TD': data1}, index=[1, 3, 5, 7]) + data2 = pd.TimedeltaIndex(['10 day', '11 day', '12 day']) + df2 = pd.DataFrame({'TD': data2}, index=[2, 4, 5]) + + res = df1.combine_first(df2) + exp_dts = pd.TimedeltaIndex(['1 day', '10 day', 'NaT', + '11 day', '3 day', '4 day']) + exp = pd.DataFrame({'TD': exp_dts}, index=[1, 2, 3, 4, 5, 7]) + tm.assert_frame_equal(res, exp) + self.assertEqual(res['TD'].dtype, 'timedelta64[ns]') + + def test_combine_first_period(self): + data1 = pd.PeriodIndex(['2011-01', 'NaT', '2011-03', + '2011-04'], freq='M') + df1 = pd.DataFrame({'P': data1}, index=[1, 3, 5, 7]) + data2 = pd.PeriodIndex(['2012-01-01', '2012-02', + '2012-03'], freq='M') + df2 = pd.DataFrame({'P': data2}, index=[2, 4, 5]) + + res = df1.combine_first(df2) + exp_dts = pd.PeriodIndex(['2011-01', '2012-01', 'NaT', + '2012-02', '2011-03', '2011-04'], + freq='M') + exp = pd.DataFrame({'P': exp_dts}, index=[1, 2, 3, 4, 5, 7]) + tm.assert_frame_equal(res, exp) + self.assertEqual(res['P'].dtype, 'object') + + # different freq + dts2 = pd.PeriodIndex(['2012-01-01', '2012-01-02', + '2012-01-03'], freq='D') + df2 = pd.DataFrame({'P': dts2}, index=[2, 4, 5]) + + res = df1.combine_first(df2) + exp_dts = [pd.Period('2011-01', freq='M'), + pd.Period('2012-01-01', freq='D'), + pd.NaT, + pd.Period('2012-01-02', freq='D'), + pd.Period('2011-03', freq='M'), + pd.Period('2011-04', freq='M')] + exp = pd.DataFrame({'P': exp_dts}, index=[1, 2, 3, 4, 5, 7]) + tm.assert_frame_equal(res, exp) + self.assertEqual(res['P'].dtype, 'object') diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index b42aef9447373..d21db5ba52a45 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -14,6 +14,7 @@ import numpy.ma as ma import numpy.ma.mrecords as mrecords +from pandas.types.common import is_integer_dtype from pandas.compat import (lmap, long, zip, range, lrange, lzip, OrderedDict, is_platform_little_endian) from pandas import compat @@ -809,7 +810,7 @@ def test_constructor_list_of_lists(self): # GH #484 l = [[1, 'a'], [2, 'b']] df = DataFrame(data=l, columns=["num", "str"]) - self.assertTrue(com.is_integer_dtype(df['num'])) + self.assertTrue(is_integer_dtype(df['num'])) self.assertEqual(df['str'].dtype, np.object_) # GH 4851 diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 5f95ff6b6b601..817770b9da610 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -1,15 +1,13 @@ # -*- coding: utf-8 
-*- from __future__ import print_function - from datetime import timedelta import numpy as np - from pandas import (DataFrame, Series, date_range, Timedelta, Timestamp, - compat, option_context) + compat, concat, option_context) from pandas.compat import u -from pandas.core import common as com +from pandas.types.dtypes import DatetimeTZDtype from pandas.tests.frame.common import TestData from pandas.util.testing import (assert_series_equal, assert_frame_equal, @@ -84,8 +82,8 @@ def test_datetime_with_tz_dtypes(self): tzframe.iloc[1, 2] = pd.NaT result = tzframe.dtypes.sort_index() expected = Series([np.dtype('datetime64[ns]'), - com.DatetimeTZDtype('datetime64[ns, US/Eastern]'), - com.DatetimeTZDtype('datetime64[ns, CET]')], + DatetimeTZDtype('datetime64[ns, US/Eastern]'), + DatetimeTZDtype('datetime64[ns, CET]')], ['A', 'B', 'C']) assert_series_equal(result, expected) @@ -398,6 +396,69 @@ def test_astype_str(self): expected = DataFrame(['1.12345678901']) assert_frame_equal(result, expected) + def test_astype_dict(self): + # GH7271 + a = Series(date_range('2010-01-04', periods=5)) + b = Series(range(5)) + c = Series([0.0, 0.2, 0.4, 0.6, 0.8]) + d = Series(['1.0', '2', '3.14', '4', '5.4']) + df = DataFrame({'a': a, 'b': b, 'c': c, 'd': d}) + original = df.copy(deep=True) + + # change type of a subset of columns + result = df.astype({'b': 'str', 'd': 'float32'}) + expected = DataFrame({ + 'a': a, + 'b': Series(['0', '1', '2', '3', '4']), + 'c': c, + 'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype='float32')}) + assert_frame_equal(result, expected) + assert_frame_equal(df, original) + + result = df.astype({'b': np.float32, 'c': 'float32', 'd': np.float64}) + expected = DataFrame({ + 'a': a, + 'b': Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype='float32'), + 'c': Series([0.0, 0.2, 0.4, 0.6, 0.8], dtype='float32'), + 'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype='float64')}) + assert_frame_equal(result, expected) + assert_frame_equal(df, original) + + # change all columns + assert_frame_equal(df.astype({'a': str, 'b': str, 'c': str, 'd': str}), + df.astype(str)) + assert_frame_equal(df, original) + + # error should be raised when using something other than column labels + # in the keys of the dtype dict + self.assertRaises(KeyError, df.astype, {'b': str, 2: str}) + self.assertRaises(KeyError, df.astype, {'e': str}) + assert_frame_equal(df, original) + + # if the dtypes provided are the same as the original dtypes, the + # resulting DataFrame should be the same as the original DataFrame + equiv = df.astype({col: df[col].dtype for col in df.columns}) + assert_frame_equal(df, equiv) + assert_frame_equal(df, original) + + def test_astype_duplicate_col(self): + a1 = Series([1, 2, 3, 4, 5], name='a') + b = Series([0.1, 0.2, 0.4, 0.6, 0.8], name='b') + a2 = Series([0, 1, 2, 3, 4], name='a') + df = concat([a1, b, a2], axis=1) + + result = df.astype(str) + a1_str = Series(['1', '2', '3', '4', '5'], dtype='str', name='a') + b_str = Series(['0.1', '0.2', '0.4', '0.6', '0.8'], dtype=str, + name='b') + a2_str = Series(['0', '1', '2', '3', '4'], dtype='str', name='a') + expected = concat([a1_str, b_str, a2_str], axis=1) + assert_frame_equal(result, expected) + + result = df.astype({'a': 'str'}) + expected = concat([a1_str, b, a2_str], axis=1) + assert_frame_equal(result, expected) + def test_timedeltas(self): df = DataFrame(dict(A=Series(date_range('2012-1-1', periods=3, freq='D')), diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py index 78354f32acbda..720dcdd62dd89 100644 --- 
a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/test_indexing.py @@ -17,6 +17,10 @@ date_range) import pandas as pd +from pandas.tseries.offsets import BDay +from pandas.types.common import (is_float_dtype, + is_integer, + is_scalar) from pandas.util.testing import (assert_almost_equal, assert_numpy_array_equal, assert_series_equal, @@ -26,7 +30,6 @@ from pandas.core.indexing import IndexingError import pandas.util.testing as tm -import pandas.lib as lib from pandas.tests.frame.common import TestData @@ -207,6 +210,16 @@ def test_setitem_callable(self): exp = pd.DataFrame({'A': [11, 12, 13, 14], 'B': [5, 6, 7, 8]}) tm.assert_frame_equal(df, exp) + def test_setitem_other_callable(self): + # GH 13299 + inc = lambda x: x + 1 + + df = pd.DataFrame([[-1, 1], [1, -1]]) + df[df > 0] = inc + + expected = pd.DataFrame([[-1, inc], [inc, -1]]) + tm.assert_frame_equal(df, expected) + def test_getitem_boolean(self): # boolean indexing d = self.tsframe.index[10] @@ -1409,15 +1422,15 @@ def test_setitem_single_column_mixed_datetime(self): # set an allowable datetime64 type from pandas import tslib df.ix['b', 'timestamp'] = tslib.iNaT - self.assertTrue(com.isnull(df.ix['b', 'timestamp'])) + self.assertTrue(isnull(df.ix['b', 'timestamp'])) # allow this syntax df.ix['c', 'timestamp'] = nan - self.assertTrue(com.isnull(df.ix['c', 'timestamp'])) + self.assertTrue(isnull(df.ix['c', 'timestamp'])) # allow this syntax df.ix['d', :] = nan - self.assertTrue(com.isnull(df.ix['c', :]).all() == False) # noqa + self.assertTrue(isnull(df.ix['c', :]).all() == False) # noqa # as of GH 3216 this will now work! # try to set with a list like item @@ -1609,7 +1622,7 @@ def test_set_value_resize(self): res = self.frame.copy() res3 = res.set_value('foobar', 'baz', 5) - self.assertTrue(com.is_float_dtype(res3['baz'])) + self.assertTrue(is_float_dtype(res3['baz'])) self.assertTrue(isnull(res3['baz'].drop(['foobar'])).all()) self.assertRaises(ValueError, res3.set_value, 'foobar', 'baz', 'sam') @@ -1652,7 +1665,7 @@ def test_single_element_ix_dont_upcast(self): (int, np.integer))) result = self.frame.ix[self.frame.index[5], 'E'] - self.assertTrue(com.is_integer(result)) + self.assertTrue(is_integer(result)) def test_irow(self): df = DataFrame(np.random.randn(10, 4), index=lrange(0, 20, 2)) @@ -2056,8 +2069,6 @@ def test_at_time_between_time_datetimeindex(self): assert_frame_equal(result, df) def test_xs(self): - from pandas.core.datetools import bday - idx = self.frame.index[5] xs = self.frame.xs(idx) for item, value in compat.iteritems(xs): @@ -2078,7 +2089,7 @@ def test_xs(self): self.assertEqual(xs['B'], '1') with tm.assertRaises(KeyError): - self.tsframe.xs(self.tsframe.index[0] - bday) + self.tsframe.xs(self.tsframe.index[0] - BDay()) # xs get column series = self.frame.xs('A', axis=1) @@ -2258,7 +2269,7 @@ def _check_align(df, cond, other, check_dtypes=True): d = df[k].values c = cond[k].reindex(df[k].index).fillna(False).values - if lib.isscalar(other): + if is_scalar(other): o = other else: if isinstance(other, np.ndarray): @@ -2760,3 +2771,9 @@ def test_transpose(self): expected = DataFrame(self.df.values.T) expected.index = ['A', 'B'] assert_frame_equal(result, expected) + + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/frame/test_misc_api.py b/pandas/tests/frame/test_misc_api.py index 03b3c0a5e65d0..089b71b30119b 100644 --- a/pandas/tests/frame/test_misc_api.py +++ 
b/pandas/tests/frame/test_misc_api.py @@ -207,7 +207,8 @@ def test_new_empty_index(self): self.assertIsNone(df2.index.name) def test_array_interface(self): - result = np.sqrt(self.frame) + with np.errstate(all='ignore'): + result = np.sqrt(self.frame) tm.assertIsInstance(result, type(self.frame)) self.assertIs(result.index, self.frame.index) self.assertIs(result.columns, self.frame.columns) diff --git a/pandas/tests/frame/test_mutate_columns.py b/pandas/tests/frame/test_mutate_columns.py index 2bdd6657eaf18..5beab1565e538 100644 --- a/pandas/tests/frame/test_mutate_columns.py +++ b/pandas/tests/frame/test_mutate_columns.py @@ -156,6 +156,13 @@ def test_insert(self): df.insert(0, 'baz', df['c']) self.assertEqual(df.columns.name, 'some_name') + # GH 13522 + df = DataFrame(index=['A', 'B', 'C']) + df['X'] = df.index + df['X'] = ['x', 'y', 'z'] + exp = DataFrame(data={'X': ['x', 'y', 'z']}, index=['A', 'B', 'C']) + assert_frame_equal(df, exp) + def test_delitem(self): del self.frame['A'] self.assertNotIn('A', self.frame) diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py index ee7c296f563f0..85aadee8b0900 100644 --- a/pandas/tests/frame/test_operators.py +++ b/pandas/tests/frame/test_operators.py @@ -217,7 +217,9 @@ def test_modulo(self): assert_frame_equal(result, expected) # numpy has a slightly different (wrong) treatement - result2 = DataFrame(p.values % p.values, index=p.index, + with np.errstate(all='ignore'): + arr = p.values % p.values + result2 = DataFrame(arr, index=p.index, columns=p.columns, dtype='float64') result2.iloc[0:3, 1] = np.nan assert_frame_equal(result2, expected) @@ -227,8 +229,9 @@ def test_modulo(self): assert_frame_equal(result, expected) # numpy has a slightly different (wrong) treatement - result2 = DataFrame(p.values.astype('float64') % - 0, index=p.index, columns=p.columns) + with np.errstate(all='ignore'): + arr = p.values.astype('float64') % 0 + result2 = DataFrame(arr, index=p.index, columns=p.columns) assert_frame_equal(result2, expected) # not commutative with series @@ -248,7 +251,9 @@ def test_div(self): 'second': Series([nan, nan, nan, 1])}) assert_frame_equal(result, expected) - result2 = DataFrame(p.values.astype('float') / p.values, index=p.index, + with np.errstate(all='ignore'): + arr = p.values.astype('float') / p.values + result2 = DataFrame(arr, index=p.index, columns=p.columns) assert_frame_equal(result2, expected) @@ -258,7 +263,9 @@ def test_div(self): assert_frame_equal(result, expected) # numpy has a slightly different (wrong) treatement - result2 = DataFrame(p.values.astype('float64') / 0, index=p.index, + with np.errstate(all='ignore'): + arr = p.values.astype('float64') / 0 + result2 = DataFrame(arr, index=p.index, columns=p.columns) assert_frame_equal(result2, expected) @@ -417,10 +424,11 @@ def test_arith_flex_frame(self): # ndim >= 3 ndim_5 = np.ones(self.frame.shape + (3, 4, 5)) - with assertRaisesRegexp(ValueError, 'shape'): + msg = "Unable to coerce to Series/DataFrame" + with assertRaisesRegexp(ValueError, msg): f(self.frame, ndim_5) - with assertRaisesRegexp(ValueError, 'shape'): + with assertRaisesRegexp(ValueError, msg): getattr(self.frame, op)(ndim_5) # res_add = self.frame.add(self.frame) @@ -581,8 +589,9 @@ def _check_unaligned_frame(meth, op, df, other): # scalar assert_frame_equal(f(0), o(df, 0)) # NAs + msg = "Unable to coerce to Series/DataFrame" assert_frame_equal(f(np.nan), o(df, np.nan)) - with assertRaisesRegexp(ValueError, 'shape'): + with assertRaisesRegexp(ValueError, msg): 
f(ndim_5) # Series @@ -662,6 +671,17 @@ def _test_seq(df, idx_ser, col_ser): exp = DataFrame({'col': [False, True, False]}) assert_frame_equal(result, exp) + def test_dti_tz_convert_to_utc(self): + base = pd.DatetimeIndex(['2011-01-01', '2011-01-02', + '2011-01-03'], tz='UTC') + idx1 = base.tz_convert('Asia/Tokyo')[:2] + idx2 = base.tz_convert('US/Eastern')[1:] + + df1 = DataFrame({'A': [1, 2]}, index=idx1) + df2 = DataFrame({'A': [1, 1]}, index=idx2) + exp = DataFrame({'A': [np.nan, 3, np.nan]}, index=base) + assert_frame_equal(df1 + df2, exp) + def test_arith_flex_series(self): df = self.simple @@ -909,6 +929,15 @@ def test_comp(func): test_comp(operator.ge) test_comp(operator.le) + def test_comparison_protected_from_errstate(self): + missing_df = tm.makeDataFrame() + missing_df.iloc[0]['A'] = np.nan + with np.errstate(invalid='ignore'): + expected = missing_df.values < 0 + with np.errstate(invalid='raise'): + result = (missing_df < 0).values + self.assert_numpy_array_equal(result, expected) + def test_string_comparison(self): df = DataFrame([{"a": 1, "b": "foo"}, {"a": 2, "b": "bar"}]) mask_a = df.a > 1 @@ -1000,44 +1029,52 @@ def test_combineAdd(self): with tm.assert_produces_warning(FutureWarning): # trivial comb = self.frame.combineAdd(self.frame) - assert_frame_equal(comb, self.frame * 2) + assert_frame_equal(comb, self.frame * 2) - # more rigorous - a = DataFrame([[1., nan, nan, 2., nan]], - columns=np.arange(5)) - b = DataFrame([[2., 3., nan, 2., 6., nan]], - columns=np.arange(6)) - expected = DataFrame([[3., 3., nan, 4., 6., nan]], - columns=np.arange(6)) + # more rigorous + a = DataFrame([[1., nan, nan, 2., nan]], + columns=np.arange(5)) + b = DataFrame([[2., 3., nan, 2., 6., nan]], + columns=np.arange(6)) + expected = DataFrame([[3., 3., nan, 4., 6., nan]], + columns=np.arange(6)) + with tm.assert_produces_warning(FutureWarning): result = a.combineAdd(b) - assert_frame_equal(result, expected) + assert_frame_equal(result, expected) + + with tm.assert_produces_warning(FutureWarning): result2 = a.T.combineAdd(b.T) - assert_frame_equal(result2, expected.T) + assert_frame_equal(result2, expected.T) - expected2 = a.combine(b, operator.add, fill_value=0.) - assert_frame_equal(expected, expected2) + expected2 = a.combine(b, operator.add, fill_value=0.) 
+ assert_frame_equal(expected, expected2) - # corner cases + # corner cases + with tm.assert_produces_warning(FutureWarning): comb = self.frame.combineAdd(self.empty) - assert_frame_equal(comb, self.frame) + assert_frame_equal(comb, self.frame) + with tm.assert_produces_warning(FutureWarning): comb = self.empty.combineAdd(self.frame) - assert_frame_equal(comb, self.frame) + assert_frame_equal(comb, self.frame) - # integer corner case - df1 = DataFrame({'x': [5]}) - df2 = DataFrame({'x': [1]}) - df3 = DataFrame({'x': [6]}) + # integer corner case + df1 = DataFrame({'x': [5]}) + df2 = DataFrame({'x': [1]}) + df3 = DataFrame({'x': [6]}) + + with tm.assert_produces_warning(FutureWarning): comb = df1.combineAdd(df2) - assert_frame_equal(comb, df3) + assert_frame_equal(comb, df3) - # mixed type GH2191 - df1 = DataFrame({'A': [1, 2], 'B': [3, 4]}) - df2 = DataFrame({'A': [1, 2], 'C': [5, 6]}) + # mixed type GH2191 + df1 = DataFrame({'A': [1, 2], 'B': [3, 4]}) + df2 = DataFrame({'A': [1, 2], 'C': [5, 6]}) + with tm.assert_produces_warning(FutureWarning): rs = df1.combineAdd(df2) - xp = DataFrame({'A': [2, 4], 'B': [3, 4.], 'C': [5, 6.]}) - assert_frame_equal(xp, rs) + xp = DataFrame({'A': [2, 4], 'B': [3, 4.], 'C': [5, 6.]}) + assert_frame_equal(xp, rs) # TODO: test integer fill corner? @@ -1176,6 +1213,53 @@ def test_inplace_ops_identity(self): assert_frame_equal(df2, expected) self.assertIs(df._data, df2._data) + def test_alignment_non_pandas(self): + index = ['A', 'B', 'C'] + columns = ['X', 'Y', 'Z'] + df = pd.DataFrame(np.random.randn(3, 3), index=index, columns=columns) + + align = pd.core.ops._align_method_FRAME + + for val in [[1, 2, 3], (1, 2, 3), np.array([1, 2, 3], dtype=np.int64)]: + + tm.assert_series_equal(align(df, val, 'index'), + Series([1, 2, 3], index=df.index)) + tm.assert_series_equal(align(df, val, 'columns'), + Series([1, 2, 3], index=df.columns)) + + # length mismatch + msg = 'Unable to coerce to Series, length must be 3: given 2' + for val in [[1, 2], (1, 2), np.array([1, 2])]: + with tm.assertRaisesRegexp(ValueError, msg): + align(df, val, 'index') + + with tm.assertRaisesRegexp(ValueError, msg): + align(df, val, 'columns') + + val = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + tm.assert_frame_equal(align(df, val, 'index'), + DataFrame(val, index=df.index, + columns=df.columns)) + tm.assert_frame_equal(align(df, val, 'columns'), + DataFrame(val, index=df.index, + columns=df.columns)) + + # shape mismatch + msg = 'Unable to coerce to DataFrame, shape must be' + val = np.array([[1, 2, 3], [4, 5, 6]]) + with tm.assertRaisesRegexp(ValueError, msg): + align(df, val, 'index') + + with tm.assertRaisesRegexp(ValueError, msg): + align(df, val, 'columns') + + val = np.zeros((3, 3, 3)) + with tm.assertRaises(ValueError): + align(df, val, 'index') + with tm.assertRaises(ValueError): + align(df, val, 'columns') + + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 49b0ce66999d8..85159de64d83e 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -147,6 +147,16 @@ def test_query_non_str(self): with tm.assertRaisesRegexp(ValueError, msg): df.query(111) + def test_eval_resolvers_as_list(self): + # GH 14095 + df = DataFrame(randn(10, 2), columns=list('ab')) + dict1 = {'a': 1} + dict2 = {'b': 2} + self.assertTrue(df.eval('a + b', resolvers=[dict1, dict2]) == + dict1['a'] + dict2['b']) + 
self.assertTrue(pd.eval('a + b', resolvers=[dict1, dict2]) == + dict1['a'] + dict2['b']) + class TestDataFrameQueryWithMultiIndex(tm.TestCase): diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index 066485e966a42..8b1b1130dc2fc 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -707,3 +707,19 @@ def _test_stack_with_multiindex(multiindex): columns=Index(['B', 'C'], name='Upper'), dtype=df.dtypes[0]) assert_frame_equal(result, expected) + + def test_stack_preserve_categorical_dtype(self): + # GH13854 + for ordered in [False, True]: + for labels in [list("yxz"), list("yxy")]: + cidx = pd.CategoricalIndex(labels, categories=list("xyz"), + ordered=ordered) + df = DataFrame([[10, 11, 12]], columns=cidx) + result = df.stack() + + # `MutliIndex.from_product` preserves categorical dtype - + # it's tested elsewhere. + midx = pd.MultiIndex.from_product([df.index, cidx]) + expected = Series([10, 11, 12], index=midx) + + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/test_sorting.py b/pandas/tests/frame/test_sorting.py index ff2159f8b6f40..b7a38e9e13ebd 100644 --- a/pandas/tests/frame/test_sorting.py +++ b/pandas/tests/frame/test_sorting.py @@ -21,77 +21,70 @@ class TestDataFrameSorting(tm.TestCase, TestData): _multiprocess_can_split_ = True - def test_sort_values(self): - # API for 9816 + def test_sort_index(self): + # GH13496 - # sort_index frame = DataFrame(np.arange(16).reshape(4, 4), index=[1, 2, 3, 4], columns=['A', 'B', 'C', 'D']) - # 9816 deprecated - with tm.assert_produces_warning(FutureWarning): - frame.sort(columns='A') - with tm.assert_produces_warning(FutureWarning): - frame.sort() - + # axis=0 : sort rows by index labels unordered = frame.ix[[3, 2, 4, 1]] - expected = unordered.sort_index() - result = unordered.sort_index(axis=0) + expected = frame assert_frame_equal(result, expected) - unordered = frame.ix[:, [2, 1, 3, 0]] - expected = unordered.sort_index(axis=1) + result = unordered.sort_index(ascending=False) + expected = frame[::-1] + assert_frame_equal(result, expected) + # axis=1 : sort columns by column names + unordered = frame.ix[:, [2, 1, 3, 0]] result = unordered.sort_index(axis=1) - assert_frame_equal(result, expected) + assert_frame_equal(result, frame) + + result = unordered.sort_index(axis=1, ascending=False) + expected = frame.ix[:, ::-1] assert_frame_equal(result, expected) - # sortlevel - mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC')) + def test_sort_index_multiindex(self): + # GH13496 + + # sort rows by specified level of multi-index + mi = MultiIndex.from_tuples([[2, 1, 3], [1, 1, 1]], names=list('ABC')) df = DataFrame([[1, 2], [3, 4]], mi) result = df.sort_index(level='A', sort_remaining=False) expected = df.sortlevel('A', sort_remaining=False) assert_frame_equal(result, expected) + # sort columns by specified level of multi-index df = df.T result = df.sort_index(level='A', axis=1, sort_remaining=False) expected = df.sortlevel('A', axis=1, sort_remaining=False) assert_frame_equal(result, expected) - # MI sort, but no by + # MI sort, but no level: sort_level has no effect mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC')) df = DataFrame([[1, 2], [3, 4]], mi) result = df.sort_index(sort_remaining=False) expected = df.sort_index() assert_frame_equal(result, expected) - def test_sort_index(self): + def test_sort(self): frame = DataFrame(np.arange(16).reshape(4, 4), index=[1, 2, 3, 4], columns=['A', 'B', 'C', 'D']) - # axis=0 
- unordered = frame.ix[[3, 2, 4, 1]] - sorted_df = unordered.sort_index(axis=0) - expected = frame - assert_frame_equal(sorted_df, expected) - - sorted_df = unordered.sort_index(ascending=False) - expected = frame[::-1] - assert_frame_equal(sorted_df, expected) - - # axis=1 - unordered = frame.ix[:, ['D', 'B', 'C', 'A']] - sorted_df = unordered.sort_index(axis=1) - expected = frame - assert_frame_equal(sorted_df, expected) + # 9816 deprecated + with tm.assert_produces_warning(FutureWarning): + frame.sort(columns='A') + with tm.assert_produces_warning(FutureWarning): + frame.sort() - sorted_df = unordered.sort_index(axis=1, ascending=False) - expected = frame.ix[:, ::-1] - assert_frame_equal(sorted_df, expected) + def test_sort_values(self): + frame = DataFrame([[1, 1, 2], [3, 1, 0], [4, 5, 6]], + index=[1, 2, 3], columns=list('ABC')) - # by column + # by column (axis=0) sorted_df = frame.sort_values(by='A') indexer = frame['A'].argsort().values expected = frame.ix[frame.index[indexer]] @@ -109,27 +102,69 @@ def test_sort_index(self): sorted_df = frame.sort_values(by=['A'], ascending=[False]) assert_frame_equal(sorted_df, expected) - # check for now - sorted_df = frame.sort_values(by='A') - assert_frame_equal(sorted_df, expected[::-1]) - expected = frame.sort_values(by='A') + # multiple bys + sorted_df = frame.sort_values(by=['B', 'C']) + expected = frame.loc[[2, 1, 3]] assert_frame_equal(sorted_df, expected) - expected = frame.sort_values(by=['A', 'B'], ascending=False) - sorted_df = frame.sort_values(by=['A', 'B']) + sorted_df = frame.sort_values(by=['B', 'C'], ascending=False) assert_frame_equal(sorted_df, expected[::-1]) + sorted_df = frame.sort_values(by=['B', 'A'], ascending=[True, False]) + assert_frame_equal(sorted_df, expected) + self.assertRaises(ValueError, lambda: frame.sort_values( by=['A', 'B'], axis=2, inplace=True)) - msg = 'When sorting by column, axis must be 0' - with assertRaisesRegexp(ValueError, msg): - frame.sort_values(by='A', axis=1) + # by row (axis=1): GH 10806 + sorted_df = frame.sort_values(by=3, axis=1) + expected = frame + assert_frame_equal(sorted_df, expected) + + sorted_df = frame.sort_values(by=3, axis=1, ascending=False) + expected = frame.reindex(columns=['C', 'B', 'A']) + assert_frame_equal(sorted_df, expected) + + sorted_df = frame.sort_values(by=[1, 2], axis='columns') + expected = frame.reindex(columns=['B', 'A', 'C']) + assert_frame_equal(sorted_df, expected) + + sorted_df = frame.sort_values(by=[1, 3], axis=1, + ascending=[True, False]) + assert_frame_equal(sorted_df, expected) + + sorted_df = frame.sort_values(by=[1, 3], axis=1, ascending=False) + expected = frame.reindex(columns=['C', 'B', 'A']) + assert_frame_equal(sorted_df, expected) msg = r'Length of ascending \(5\) != length of by \(2\)' with assertRaisesRegexp(ValueError, msg): frame.sort_values(by=['A', 'B'], axis=0, ascending=[True] * 5) + def test_sort_values_inplace(self): + frame = DataFrame(np.random.randn(4, 4), index=[1, 2, 3, 4], + columns=['A', 'B', 'C', 'D']) + + sorted_df = frame.copy() + sorted_df.sort_values(by='A', inplace=True) + expected = frame.sort_values(by='A') + assert_frame_equal(sorted_df, expected) + + sorted_df = frame.copy() + sorted_df.sort_values(by=1, axis=1, inplace=True) + expected = frame.sort_values(by=1, axis=1) + assert_frame_equal(sorted_df, expected) + + sorted_df = frame.copy() + sorted_df.sort_values(by='A', ascending=False, inplace=True) + expected = frame.sort_values(by='A', ascending=False) + assert_frame_equal(sorted_df, expected) + + sorted_df = 
frame.copy() + sorted_df.sort_values(by=['A', 'B'], ascending=False, inplace=True) + expected = frame.sort_values(by=['A', 'B'], ascending=False) + assert_frame_equal(sorted_df, expected) + def test_sort_index_categorical_index(self): df = (DataFrame({'A': np.arange(6, dtype='int64'), @@ -166,6 +201,10 @@ def test_sort_nan(self): sorted_df = df.sort_values(['A'], na_position='first', ascending=False) assert_frame_equal(sorted_df, expected) + expected = df.reindex(columns=['B', 'A']) + sorted_df = df.sort_values(by=1, axis=1, na_position='first') + assert_frame_equal(sorted_df, expected) + # na_position='last', order expected = DataFrame( {'A': [1, 1, 2, 4, 6, 8, nan], @@ -361,25 +400,6 @@ def test_sort_index_different_sortorder(self): result = idf['C'].sort_index(ascending=[1, 0]) assert_series_equal(result, expected['C']) - def test_sort_inplace(self): - frame = DataFrame(np.random.randn(4, 4), index=[1, 2, 3, 4], - columns=['A', 'B', 'C', 'D']) - - sorted_df = frame.copy() - sorted_df.sort_values(by='A', inplace=True) - expected = frame.sort_values(by='A') - assert_frame_equal(sorted_df, expected) - - sorted_df = frame.copy() - sorted_df.sort_values(by='A', ascending=False, inplace=True) - expected = frame.sort_values(by='A', ascending=False) - assert_frame_equal(sorted_df, expected) - - sorted_df = frame.copy() - sorted_df.sort_values(by=['A', 'B'], ascending=False, inplace=True) - expected = frame.sort_values(by=['A', 'B'], ascending=False) - assert_frame_equal(sorted_df, expected) - def test_sort_index_duplicates(self): # with 9816, these are all translated to .sort_values diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py index ee12d9e84511c..6a57f67a6cb3d 100644 --- a/pandas/tests/frame/test_subclass.py +++ b/pandas/tests/frame/test_subclass.py @@ -210,3 +210,40 @@ def test_subclass_align_combinations(self): tm.assert_series_equal(res1, exp2) tm.assertIsInstance(res2, tm.SubclassedDataFrame) tm.assert_frame_equal(res2, exp1) + + def test_subclass_iterrows(self): + # GH 13977 + df = tm.SubclassedDataFrame({'a': [1]}) + for i, row in df.iterrows(): + tm.assertIsInstance(row, tm.SubclassedSeries) + tm.assert_series_equal(row, df.loc[i]) + + def test_subclass_sparse_slice(self): + rows = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]] + ssdf = tm.SubclassedSparseDataFrame(rows) + ssdf.testattr = "testattr" + + tm.assert_sp_frame_equal(ssdf.loc[:2], + tm.SubclassedSparseDataFrame(rows[:3])) + tm.assert_sp_frame_equal(ssdf.iloc[:2], + tm.SubclassedSparseDataFrame(rows[:2])) + tm.assert_sp_frame_equal(ssdf[:2], + tm.SubclassedSparseDataFrame(rows[:2])) + tm.assert_equal(ssdf.loc[:2].testattr, "testattr") + tm.assert_equal(ssdf.iloc[:2].testattr, "testattr") + tm.assert_equal(ssdf[:2].testattr, "testattr") + + tm.assert_sp_series_equal(ssdf.loc[1], + tm.SubclassedSparseSeries(rows[1]), + check_names=False) + tm.assert_sp_series_equal(ssdf.iloc[1], + tm.SubclassedSparseSeries(rows[1]), + check_names=False) + + def test_subclass_sparse_transpose(self): + ossdf = tm.SubclassedSparseDataFrame([[1, 2, 3], + [4, 5, 6]]) + essdf = tm.SubclassedSparseDataFrame([[1, 4], + [2, 5], + [3, 6]]) + tm.assert_sp_frame_equal(ossdf.T, essdf) diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py index b9baae6cbeda7..9758c2b9c805e 100644 --- a/pandas/tests/frame/test_timeseries.py +++ b/pandas/tests/frame/test_timeseries.py @@ -10,7 +10,7 @@ from pandas import DataFrame, Series, Index, Timestamp, DatetimeIndex import pandas as pd 
-import pandas.core.datetools as datetools +import pandas.tseries.offsets as offsets from pandas.util.testing import (assert_almost_equal, assert_series_equal, @@ -136,14 +136,14 @@ def test_shift(self): assert_frame_equal(unshifted, self.tsframe) # shift by DateOffset - shiftedFrame = self.tsframe.shift(5, freq=datetools.BDay()) + shiftedFrame = self.tsframe.shift(5, freq=offsets.BDay()) self.assertEqual(len(shiftedFrame), len(self.tsframe)) shiftedFrame2 = self.tsframe.shift(5, freq='B') assert_frame_equal(shiftedFrame, shiftedFrame2) d = self.tsframe.index[0] - shifted_d = d + datetools.BDay(5) + shifted_d = d + offsets.BDay(5) assert_series_equal(self.tsframe.xs(d), shiftedFrame.xs(shifted_d), check_names=False) @@ -160,7 +160,7 @@ def test_shift(self): ps.ix[:-1, 0].values) shifted2 = ps.shift(1, 'B') - shifted3 = ps.shift(1, datetools.bday) + shifted3 = ps.shift(1, offsets.BDay()) assert_frame_equal(shifted2, shifted3) assert_frame_equal(ps, shifted2.shift(-1, 'B')) @@ -222,7 +222,7 @@ def test_tshift(self): shifted2 = ps.tshift(freq='B') assert_frame_equal(shifted, shifted2) - shifted3 = ps.tshift(freq=datetools.bday) + shifted3 = ps.tshift(freq=offsets.BDay()) assert_frame_equal(shifted, shifted3) assertRaisesRegexp(ValueError, 'does not match', ps.tshift, freq='M') @@ -297,7 +297,7 @@ def test_truncate_copy(self): self.assertFalse((self.tsframe.values[5:11] == 5).any()) def test_asfreq(self): - offset_monthly = self.tsframe.asfreq(datetools.bmonthEnd) + offset_monthly = self.tsframe.asfreq(offsets.BMonthEnd()) rule_monthly = self.tsframe.asfreq('BM') assert_almost_equal(offset_monthly['A'], rule_monthly['A']) @@ -341,3 +341,33 @@ def test_first_last_valid(self): empty = DataFrame() self.assertIsNone(empty.last_valid_index()) self.assertIsNone(empty.first_valid_index()) + + def test_operation_on_NaT(self): + # Both NaT and Timestamp are in DataFrame. + df = pd.DataFrame({'foo': [pd.NaT, pd.NaT, + pd.Timestamp('2012-05-01')]}) + + res = df.min() + exp = pd.Series([pd.Timestamp('2012-05-01')], index=["foo"]) + tm.assert_series_equal(res, exp) + + res = df.max() + exp = pd.Series([pd.Timestamp('2012-05-01')], index=["foo"]) + tm.assert_series_equal(res, exp) + + # GH12941, only NaTs are in DataFrame. 
+ df = pd.DataFrame({'foo': [pd.NaT, pd.NaT]}) + + res = df.min() + exp = pd.Series([pd.NaT], index=["foo"]) + tm.assert_series_equal(res, exp) + + res = df.max() + exp = pd.Series([pd.NaT], index=["foo"]) + tm.assert_series_equal(res, exp) + + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index bacf604c491b1..6d09378ca864e 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -10,7 +10,7 @@ from pandas.compat import (lmap, range, lrange, StringIO, u) from pandas.parser import CParserError from pandas import (DataFrame, Index, Series, MultiIndex, Timestamp, - date_range, read_csv, compat) + date_range, read_csv, compat, to_datetime) import pandas as pd from pandas.util.testing import (assert_almost_equal, @@ -139,7 +139,7 @@ def test_to_csv_from_csv5(self): self.tzframe.to_csv(path) result = pd.read_csv(path, index_col=0, parse_dates=['A']) - converter = lambda c: pd.to_datetime(result[c]).dt.tz_localize( + converter = lambda c: to_datetime(result[c]).dt.tz_localize( 'UTC').dt.tz_convert(self.tzframe[c].dt.tz) result['B'] = converter('B') result['C'] = converter('C') @@ -162,15 +162,6 @@ def test_to_csv_cols_reordering(self): assert_frame_equal(df[cols], rs_c, check_names=False) - def test_to_csv_legacy_raises_on_dupe_cols(self): - df = mkdf(10, 3) - df.columns = ['a', 'a', 'b'] - with ensure_clean() as path: - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - self.assertRaises(NotImplementedError, - df.to_csv, path, engine='python') - def test_to_csv_new_dupe_cols(self): import pandas as pd @@ -300,10 +291,10 @@ def _to_uni(x): elif r_dtype == 'p': r_dtype = 'O' recons.index = np.array( - list(map(Timestamp, recons.index.to_datetime())), + list(map(Timestamp, to_datetime(recons.index))), dtype=r_dtype) df.index = np.array( - list(map(Timestamp, df.index.to_datetime())), + list(map(Timestamp, df.index.to_timestamp())), dtype=r_dtype) else: r_dtype = type_map.get(r_dtype) @@ -325,10 +316,10 @@ def _to_uni(x): elif c_dtype == 'p': c_dtype = 'O' recons.columns = np.array( - lmap(Timestamp, recons.columns.to_datetime()), + lmap(Timestamp, to_datetime(recons.columns)), dtype=c_dtype) df.columns = np.array( - lmap(Timestamp, df.columns.to_datetime()), + lmap(Timestamp, df.columns.to_timestamp()), dtype=c_dtype) else: c_dtype = type_map.get(c_dtype) @@ -596,21 +587,9 @@ def _make_frame(names=None): df = _make_frame(True) df.to_csv(path, tupleize_cols=False) - # catch invalid headers - with assertRaisesRegexp(CParserError, - 'Passed header=\[0,1,2\] are too many ' - 'rows for this multi_index of columns'): - read_csv(path, tupleize_cols=False, - header=lrange(3), index_col=0) - - with assertRaisesRegexp(CParserError, - 'Passed header=\[0,1,2,3,4,5,6\], len of ' - '7, but only 6 lines in file'): - read_csv(path, tupleize_cols=False, - header=lrange(7), index_col=0) - - for i in [4, 5, 6]: - with tm.assertRaises(CParserError): + for i in [5, 6, 7]: + msg = 'len of {i}, but only 5 lines in file'.format(i=i) + with assertRaisesRegexp(CParserError, msg): read_csv(path, tupleize_cols=False, header=lrange(i), index_col=0) @@ -712,7 +691,6 @@ def test_to_csv_dups_cols(self): cols.extend([0, 1, 2]) df.columns = cols - from pandas import to_datetime with ensure_clean() as filename: df.to_csv(filename) result = read_csv(filename, index_col=0) @@ -824,35 +802,6 @@ def 
test_to_csv_float_format(self): index=['A', 'B'], columns=['X', 'Y', 'Z']) assert_frame_equal(rs, xp) - def test_to_csv_quoting(self): - df = DataFrame({'A': [1, 2, 3], 'B': ['foo', 'bar', 'baz']}) - - buf = StringIO() - df.to_csv(buf, index=False, quoting=csv.QUOTE_NONNUMERIC) - - result = buf.getvalue() - expected = ('"A","B"\n' - '1,"foo"\n' - '2,"bar"\n' - '3,"baz"\n') - - self.assertEqual(result, expected) - - # quoting windows line terminators, presents with encoding? - # #3503 - text = 'a,b,c\n1,"test \r\n",3\n' - df = pd.read_csv(StringIO(text)) - buf = StringIO() - df.to_csv(buf, encoding='utf-8', index=False) - self.assertEqual(buf.getvalue(), text) - - # testing if quoting parameter is passed through with multi-indexes - # related to issue #7791 - df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]}) - df = df.set_index(['a', 'b']) - expected = '"a","b","c"\n"1","3","5"\n"2","4","6"\n' - self.assertEqual(df.to_csv(quoting=csv.QUOTE_ALL), expected) - def test_to_csv_unicodewriter_quoting(self): df = DataFrame({'A': [1, 2, 3], 'B': ['foo', 'bar', 'baz']}) @@ -935,7 +884,7 @@ def test_to_csv_path_is_none(self): # GH 8215 # Make sure we return string for consistency with # Series.to_csv() - csv_str = self.frame.to_csv(path=None) + csv_str = self.frame.to_csv(path_or_buf=None) self.assertIsInstance(csv_str, str) recons = pd.read_csv(StringIO(csv_str), index_col=0) assert_frame_equal(self.frame, recons) @@ -1022,72 +971,57 @@ def test_to_csv_compression_value_error(self): filename, compression="zip") def test_to_csv_date_format(self): - from pandas import to_datetime with ensure_clean('__tmp_to_csv_date_format__') as path: - for engine in [None, 'python']: - w = FutureWarning if engine == 'python' else None - - dt_index = self.tsframe.index - datetime_frame = DataFrame( - {'A': dt_index, 'B': dt_index.shift(1)}, index=dt_index) - - with tm.assert_produces_warning(w, check_stacklevel=False): - datetime_frame.to_csv( - path, date_format='%Y%m%d', engine=engine) - - # Check that the data was put in the specified format - test = read_csv(path, index_col=0) + dt_index = self.tsframe.index + datetime_frame = DataFrame( + {'A': dt_index, 'B': dt_index.shift(1)}, index=dt_index) + datetime_frame.to_csv(path, date_format='%Y%m%d') - datetime_frame_int = datetime_frame.applymap( - lambda x: int(x.strftime('%Y%m%d'))) - datetime_frame_int.index = datetime_frame_int.index.map( - lambda x: int(x.strftime('%Y%m%d'))) + # Check that the data was put in the specified format + test = read_csv(path, index_col=0) - assert_frame_equal(test, datetime_frame_int) + datetime_frame_int = datetime_frame.applymap( + lambda x: int(x.strftime('%Y%m%d'))) + datetime_frame_int.index = datetime_frame_int.index.map( + lambda x: int(x.strftime('%Y%m%d'))) - with tm.assert_produces_warning(w, check_stacklevel=False): - datetime_frame.to_csv( - path, date_format='%Y-%m-%d', engine=engine) + assert_frame_equal(test, datetime_frame_int) - # Check that the data was put in the specified format - test = read_csv(path, index_col=0) - datetime_frame_str = datetime_frame.applymap( - lambda x: x.strftime('%Y-%m-%d')) - datetime_frame_str.index = datetime_frame_str.index.map( - lambda x: x.strftime('%Y-%m-%d')) + datetime_frame.to_csv(path, date_format='%Y-%m-%d') - assert_frame_equal(test, datetime_frame_str) + # Check that the data was put in the specified format + test = read_csv(path, index_col=0) + datetime_frame_str = datetime_frame.applymap( + lambda x: x.strftime('%Y-%m-%d')) + datetime_frame_str.index = 
datetime_frame_str.index.map( + lambda x: x.strftime('%Y-%m-%d')) - # Check that columns get converted - datetime_frame_columns = datetime_frame.T + assert_frame_equal(test, datetime_frame_str) - with tm.assert_produces_warning(w, check_stacklevel=False): - datetime_frame_columns.to_csv( - path, date_format='%Y%m%d', engine=engine) + # Check that columns get converted + datetime_frame_columns = datetime_frame.T + datetime_frame_columns.to_csv(path, date_format='%Y%m%d') - test = read_csv(path, index_col=0) + test = read_csv(path, index_col=0) - datetime_frame_columns = datetime_frame_columns.applymap( - lambda x: int(x.strftime('%Y%m%d'))) - # Columns don't get converted to ints by read_csv - datetime_frame_columns.columns = ( - datetime_frame_columns.columns - .map(lambda x: x.strftime('%Y%m%d'))) + datetime_frame_columns = datetime_frame_columns.applymap( + lambda x: int(x.strftime('%Y%m%d'))) + # Columns don't get converted to ints by read_csv + datetime_frame_columns.columns = ( + datetime_frame_columns.columns + .map(lambda x: x.strftime('%Y%m%d'))) - assert_frame_equal(test, datetime_frame_columns) + assert_frame_equal(test, datetime_frame_columns) - # test NaTs - nat_index = to_datetime( - ['NaT'] * 10 + ['2000-01-01', '1/1/2000', '1-1-2000']) - nat_frame = DataFrame({'A': nat_index}, index=nat_index) + # test NaTs + nat_index = to_datetime( + ['NaT'] * 10 + ['2000-01-01', '1/1/2000', '1-1-2000']) + nat_frame = DataFrame({'A': nat_index}, index=nat_index) + nat_frame.to_csv(path, date_format='%Y-%m-%d') - with tm.assert_produces_warning(w, check_stacklevel=False): - nat_frame.to_csv( - path, date_format='%Y-%m-%d', engine=engine) + test = read_csv(path, parse_dates=[0, 1], index_col=0) - test = read_csv(path, parse_dates=[0, 1], index_col=0) - - assert_frame_equal(test, nat_frame) + assert_frame_equal(test, nat_frame) def test_to_csv_with_dst_transitions(self): @@ -1106,7 +1040,7 @@ def test_to_csv_with_dst_transitions(self): # we have to reconvert the index as we # don't parse the tz's result = read_csv(path, index_col=0) - result.index = pd.to_datetime(result.index).tz_localize( + result.index = to_datetime(result.index).tz_localize( 'UTC').tz_convert('Europe/London') assert_frame_equal(result, df) @@ -1118,9 +1052,9 @@ def test_to_csv_with_dst_transitions(self): with ensure_clean('csv_date_format_with_dst') as path: df.to_csv(path, index=True) result = read_csv(path, index_col=0) - result.index = pd.to_datetime(result.index).tz_localize( + result.index = to_datetime(result.index).tz_localize( 'UTC').tz_convert('Europe/Paris') - result['idx'] = pd.to_datetime(result['idx']).astype( + result['idx'] = to_datetime(result['idx']).astype( 'datetime64[ns, Europe/Paris]') assert_frame_equal(result, df) @@ -1131,3 +1065,89 @@ def test_to_csv_with_dst_transitions(self): df.to_pickle(path) result = pd.read_pickle(path) assert_frame_equal(result, df) + + def test_to_csv_quoting(self): + df = DataFrame({ + 'c_string': ['a', 'b,c'], + 'c_int': [42, np.nan], + 'c_float': [1.0, 3.2], + 'c_bool': [True, False], + }) + + expected = """\ +,c_bool,c_float,c_int,c_string +0,True,1.0,42.0,a +1,False,3.2,,"b,c" +""" + result = df.to_csv() + self.assertEqual(result, expected) + + result = df.to_csv(quoting=None) + self.assertEqual(result, expected) + + result = df.to_csv(quoting=csv.QUOTE_MINIMAL) + self.assertEqual(result, expected) + + expected = """\ +"","c_bool","c_float","c_int","c_string" +"0","True","1.0","42.0","a" +"1","False","3.2","","b,c" +""" + result = df.to_csv(quoting=csv.QUOTE_ALL) + 
self.assertEqual(result, expected) + + # see gh-12922, gh-13259: make sure changes to + # the formatters do not break this behaviour + expected = """\ +"","c_bool","c_float","c_int","c_string" +0,True,1.0,42.0,"a" +1,False,3.2,"","b,c" +""" + result = df.to_csv(quoting=csv.QUOTE_NONNUMERIC) + self.assertEqual(result, expected) + + msg = "need to escape, but no escapechar set" + tm.assertRaisesRegexp(csv.Error, msg, df.to_csv, + quoting=csv.QUOTE_NONE) + tm.assertRaisesRegexp(csv.Error, msg, df.to_csv, + quoting=csv.QUOTE_NONE, + escapechar=None) + + expected = """\ +,c_bool,c_float,c_int,c_string +0,True,1.0,42.0,a +1,False,3.2,,b!,c +""" + result = df.to_csv(quoting=csv.QUOTE_NONE, + escapechar='!') + self.assertEqual(result, expected) + + expected = """\ +,c_bool,c_ffloat,c_int,c_string +0,True,1.0,42.0,a +1,False,3.2,,bf,c +""" + result = df.to_csv(quoting=csv.QUOTE_NONE, + escapechar='f') + self.assertEqual(result, expected) + + # see gh-3503: quoting Windows line terminators + # presents with encoding? + text = 'a,b,c\n1,"test \r\n",3\n' + df = pd.read_csv(StringIO(text)) + buf = StringIO() + df.to_csv(buf, encoding='utf-8', index=False) + self.assertEqual(buf.getvalue(), text) + + # xref gh-7791: make sure the quoting parameter is passed through + # with multi-indexes + df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]}) + df = df.set_index(['a', 'b']) + expected = '"a","b","c"\n"1","3","5"\n"2","4","6"\n' + self.assertEqual(df.to_csv(quoting=csv.QUOTE_ALL), expected) + + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index e342eee2aabbb..773f20532e4ff 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -8,6 +8,7 @@ from pandas import (Series, Index, Float64Index, Int64Index, RangeIndex, MultiIndex, CategoricalIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex, notnull) +from pandas.types.common import needs_i8_conversion from pandas.util.testing import assertRaisesRegexp import pandas.util.testing as tm @@ -110,7 +111,7 @@ def f(): def test_reindex_base(self): idx = self.create_index() - expected = np.arange(idx.size) + expected = np.arange(idx.size, dtype=np.intp) actual = idx.get_indexer(idx) tm.assert_numpy_array_equal(expected, actual) @@ -149,10 +150,7 @@ def test_dtype_str(self): for idx in self.indices.values(): dtype = idx.dtype_str self.assertIsInstance(dtype, compat.string_types) - if isinstance(idx, PeriodIndex): - self.assertEqual(dtype, 'period') - else: - self.assertEqual(dtype, str(idx.dtype)) + self.assertEqual(dtype, str(idx.dtype)) def test_repr_max_seq_item_setting(self): # GH10182 @@ -205,6 +203,62 @@ def test_hash_error(self): type(ind).__name__): hash(ind) + def test_copy_name(self): + # Check that "name" argument passed at initialization is honoured + # GH12309 + for name, index in compat.iteritems(self.indices): + if isinstance(index, MultiIndex): + continue + + first = index.__class__(index, copy=True, name='mario') + second = first.__class__(first, copy=False) + + # Even though "copy=False", we want a new object. 
+ self.assertIsNot(first, second) + # Not using tm.assert_index_equal() since names differ: + self.assertTrue(index.equals(first)) + + self.assertEqual(first.name, 'mario') + self.assertEqual(second.name, 'mario') + + s1 = Series(2, index=first) + s2 = Series(3, index=second[:-1]) + if not isinstance(index, CategoricalIndex): # See GH13365 + s3 = s1 * s2 + self.assertEqual(s3.index.name, 'mario') + + def test_ensure_copied_data(self): + # Check the "copy" argument of each Index.__new__ is honoured + # GH12309 + for name, index in compat.iteritems(self.indices): + init_kwargs = {} + if isinstance(index, PeriodIndex): + # Needs "freq" specification: + init_kwargs['freq'] = index.freq + elif isinstance(index, (RangeIndex, MultiIndex, CategoricalIndex)): + # RangeIndex cannot be initialized from data + # MultiIndex and CategoricalIndex are tested separately + continue + + index_type = index.__class__ + result = index_type(index.values, copy=True, **init_kwargs) + tm.assert_index_equal(index, result) + tm.assert_numpy_array_equal(index.values, result.values, + check_same='copy') + + if not isinstance(index, PeriodIndex): + result = index_type(index.values, copy=False, **init_kwargs) + tm.assert_numpy_array_equal(index.values, result.values, + check_same='same') + tm.assert_numpy_array_equal(index._values, result._values, + check_same='same') + else: + # .values an object array of Period, thus copied + result = index_type(ordinal=index.asi8, copy=False, + **init_kwargs) + tm.assert_numpy_array_equal(index._values, result._values, + check_same='same') + def test_copy_and_deepcopy(self): from copy import copy, deepcopy @@ -240,6 +294,53 @@ def test_duplicates(self): self.assertEqual(result.name, 'foo') self.assert_index_equal(result, Index([ind[0]], name='foo')) + def test_get_unique_index(self): + for ind in self.indices.values(): + + # MultiIndex tested separately + if not len(ind) or isinstance(ind, MultiIndex): + continue + + idx = ind[[0] * 5] + idx_unique = ind[[0]] + # We test against `idx_unique`, so first we make sure it's unique + # and doesn't contain nans. 
+ self.assertTrue(idx_unique.is_unique) + try: + self.assertFalse(idx_unique.hasnans) + except NotImplementedError: + pass + + for dropna in [False, True]: + result = idx._get_unique_index(dropna=dropna) + self.assert_index_equal(result, idx_unique) + + # nans: + + if not ind._can_hold_na: + continue + + if needs_i8_conversion(ind): + vals = ind.asi8[[0] * 5] + vals[0] = pd.tslib.iNaT + else: + vals = ind.values[[0] * 5] + vals[0] = np.nan + + vals_unique = vals[:2] + idx_nan = ind._shallow_copy(vals) + idx_unique_nan = ind._shallow_copy(vals_unique) + self.assertTrue(idx_unique_nan.is_unique) + + self.assertEqual(idx_nan.dtype, ind.dtype) + self.assertEqual(idx_unique_nan.dtype, ind.dtype) + + for dropna, expected in zip([False, True], + [idx_unique_nan, idx_unique]): + for i in [idx_nan, idx_unique_nan]: + result = i._get_unique_index(dropna=dropna) + self.assert_index_equal(result, expected) + def test_sort(self): for ind in self.indices.values(): self.assertRaises(TypeError, ind.sort) @@ -549,6 +650,20 @@ def test_delete_base(self): # either depending on numpy version result = idx.delete(len(idx)) + def test_equals(self): + + for name, idx in compat.iteritems(self.indices): + self.assertTrue(idx.equals(idx)) + self.assertTrue(idx.equals(idx.copy())) + self.assertTrue(idx.equals(idx.astype(object))) + + self.assertFalse(idx.equals(list(idx))) + self.assertFalse(idx.equals(np.array(idx))) + + if idx.nlevels == 1: + # do not test MultiIndex + self.assertFalse(idx.equals(pd.Series(idx))) + def test_equals_op(self): # GH9947, GH10637 index_a = self.create_index() @@ -593,7 +708,8 @@ def test_equals_op(self): index_a == series_d with tm.assertRaisesRegexp(ValueError, "Lengths must match"): index_a == array_d - with tm.assertRaisesRegexp(ValueError, "Series lengths must match"): + msg = "Can only compare identically-labeled Series objects" + with tm.assertRaisesRegexp(ValueError, msg): series_a == series_d with tm.assertRaisesRegexp(ValueError, "Lengths must match"): series_a == array_d @@ -626,11 +742,13 @@ def test_numpy_ufuncs(self): # raise TypeError or ValueError (PeriodIndex) # PeriodIndex behavior should be changed in future version with tm.assertRaises(Exception): - func(idx) + with np.errstate(all='ignore'): + func(idx) elif isinstance(idx, (Float64Index, Int64Index)): # coerces to float (e.g. 
np.sin) - result = func(idx) - exp = Index(func(idx.values), name=idx.name) + with np.errstate(all='ignore'): + result = func(idx) + exp = Index(func(idx.values), name=idx.name) self.assert_index_equal(result, exp) self.assertIsInstance(result, pd.Float64Index) else: @@ -639,7 +757,8 @@ def test_numpy_ufuncs(self): continue else: with tm.assertRaises(Exception): - func(idx) + with np.errstate(all='ignore'): + func(idx) for func in [np.isfinite, np.isinf, np.isnan, np.signbit]: if isinstance(idx, pd.tseries.base.DatetimeIndexOpsMixin): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index aa007c039f8ee..7f68318d4d7d3 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -2,9 +2,6 @@ from datetime import datetime, timedelta -# TODO(wesm): fix long line flake8 issues -# flake8: noqa - import pandas.util.testing as tm from pandas.indexes.api import Index, MultiIndex from .common import Base @@ -152,8 +149,8 @@ def test_constructor_from_series(self): expected = DatetimeIndex([Timestamp('20110101'), Timestamp('20120101'), Timestamp('20130101')]) - s = Series([Timestamp('20110101'), Timestamp('20120101'), Timestamp( - '20130101')]) + s = Series([Timestamp('20110101'), Timestamp('20120101'), + Timestamp('20130101')]) result = Index(s) self.assert_index_equal(result, expected) result = DatetimeIndex(s) @@ -172,6 +169,7 @@ def test_constructor_from_series(self): df['date'] = ['1-1-1990', '2-1-1990', '3-1-1990', '4-1-1990', '5-1-1990'] result = DatetimeIndex(df['date'], freq='MS') + expected.name = 'date' self.assert_index_equal(result, expected) self.assertEqual(df['date'].dtype, object) @@ -202,6 +200,49 @@ def __array__(self, dtype=None): result = pd.Index(ArrayLike(array)) self.assert_index_equal(result, expected) + def test_index_ctor_infer_nan_nat(self): + # GH 13467 + exp = pd.Float64Index([np.nan, np.nan]) + self.assertEqual(exp.dtype, np.float64) + tm.assert_index_equal(Index([np.nan, np.nan]), exp) + tm.assert_index_equal(Index(np.array([np.nan, np.nan])), exp) + + exp = pd.DatetimeIndex([pd.NaT, pd.NaT]) + self.assertEqual(exp.dtype, 'datetime64[ns]') + tm.assert_index_equal(Index([pd.NaT, pd.NaT]), exp) + tm.assert_index_equal(Index(np.array([pd.NaT, pd.NaT])), exp) + + exp = pd.DatetimeIndex([pd.NaT, pd.NaT]) + self.assertEqual(exp.dtype, 'datetime64[ns]') + + for data in [[pd.NaT, np.nan], [np.nan, pd.NaT], + [np.nan, np.datetime64('nat')], + [np.datetime64('nat'), np.nan]]: + tm.assert_index_equal(Index(data), exp) + tm.assert_index_equal(Index(np.array(data, dtype=object)), exp) + + exp = pd.TimedeltaIndex([pd.NaT, pd.NaT]) + self.assertEqual(exp.dtype, 'timedelta64[ns]') + + for data in [[np.nan, np.timedelta64('nat')], + [np.timedelta64('nat'), np.nan], + [pd.NaT, np.timedelta64('nat')], + [np.timedelta64('nat'), pd.NaT]]: + + tm.assert_index_equal(Index(data), exp) + tm.assert_index_equal(Index(np.array(data, dtype=object)), exp) + + # mixed np.datetime64/timedelta64 nat results in object + data = [np.datetime64('nat'), np.timedelta64('nat')] + exp = pd.Index(data, dtype=object) + tm.assert_index_equal(Index(data), exp) + tm.assert_index_equal(Index(np.array(data, dtype=object)), exp) + + data = [np.timedelta64('nat'), np.datetime64('nat')] + exp = pd.Index(data, dtype=object) + tm.assert_index_equal(Index(data), exp) + tm.assert_index_equal(Index(np.array(data, dtype=object)), exp) + def test_index_ctor_infer_periodindex(self): xp = period_range('2012-1-1', freq='M', periods=3) rs = Index(xp) @@ -242,7 +283,8 
@@ def test_constructor_dtypes(self): for idx in [Index(np.array([True, False, True], dtype=bool)), Index([True, False, True]), - Index(np.array([True, False, True], dtype=bool), dtype=bool), + Index(np.array([True, False, True], dtype=bool), + dtype=bool), Index([True, False, True], dtype=bool)]: self.assertIsInstance(idx, Index) self.assertEqual(idx.dtype, object) @@ -250,8 +292,10 @@ def test_constructor_dtypes(self): for idx in [Index(np.array([1, 2, 3], dtype=int), dtype='category'), Index([1, 2, 3], dtype='category'), Index(np.array([np_datetime64_compat('2011-01-01'), - np_datetime64_compat('2011-01-02')]), dtype='category'), - Index([datetime(2011, 1, 1), datetime(2011, 1, 2)], dtype='category')]: + np_datetime64_compat('2011-01-02')]), + dtype='category'), + Index([datetime(2011, 1, 1), datetime(2011, 1, 2)], + dtype='category')]: self.assertIsInstance(idx, CategoricalIndex) for idx in [Index(np.array([np_datetime64_compat('2011-01-01'), @@ -260,7 +304,8 @@ def test_constructor_dtypes(self): self.assertIsInstance(idx, DatetimeIndex) for idx in [Index(np.array([np_datetime64_compat('2011-01-01'), - np_datetime64_compat('2011-01-02')]), dtype=object), + np_datetime64_compat('2011-01-02')]), + dtype=object), Index([datetime(2011, 1, 1), datetime(2011, 1, 2)], dtype=object)]: self.assertNotIsInstance(idx, DatetimeIndex) @@ -278,6 +323,45 @@ def test_constructor_dtypes(self): self.assertIsInstance(idx, Index) self.assertEqual(idx.dtype, object) + def test_constructor_dtypes_datetime(self): + + for tz in [None, 'UTC', 'US/Eastern', 'Asia/Tokyo']: + idx = pd.date_range('2011-01-01', periods=5, tz=tz) + dtype = idx.dtype + + # pass values without timezone, as DatetimeIndex localizes it + for values in [pd.date_range('2011-01-01', periods=5).values, + pd.date_range('2011-01-01', periods=5).asi8]: + + for res in [pd.Index(values, tz=tz), + pd.Index(values, dtype=dtype), + pd.Index(list(values), tz=tz), + pd.Index(list(values), dtype=dtype)]: + tm.assert_index_equal(res, idx) + + # check compat with DatetimeIndex + for res in [pd.DatetimeIndex(values, tz=tz), + pd.DatetimeIndex(values, dtype=dtype), + pd.DatetimeIndex(list(values), tz=tz), + pd.DatetimeIndex(list(values), dtype=dtype)]: + tm.assert_index_equal(res, idx) + + def test_constructor_dtypes_timedelta(self): + + idx = pd.timedelta_range('1 days', periods=5) + dtype = idx.dtype + + for values in [idx.values, idx.asi8]: + + for res in [pd.Index(values, dtype=dtype), + pd.Index(list(values), dtype=dtype)]: + tm.assert_index_equal(res, idx) + + # check compat with TimedeltaIndex + for res in [pd.TimedeltaIndex(values, dtype=dtype), + pd.TimedeltaIndex(list(values), dtype=dtype)]: + tm.assert_index_equal(res, idx) + def test_view_with_args(self): restricted = ['unicodeIndex', 'strIndex', 'catIndex', 'boolIndex', @@ -316,7 +400,7 @@ def test_astype(self): casted = self.intIndex.astype('i8') self.assertEqual(casted.name, 'foobar') - def test_equals(self): + def test_equals_object(self): # same self.assertTrue(Index(['a', 'b', 'c']).equals(Index(['a', 'b', 'c']))) @@ -418,7 +502,7 @@ def test_asof(self): d = self.dateIndex[-1] self.assertEqual(self.dateIndex.asof(d + timedelta(1)), d) - d = self.dateIndex[0].to_datetime() + d = self.dateIndex[0].to_pydatetime() tm.assertIsInstance(self.dateIndex.asof(d), Timestamp) def test_asof_datetime_partial(self): @@ -439,10 +523,9 @@ def test_nanosecond_index_access(self): # self.assertEqual(first_value, # x['2013-01-01 00:00:00.000000050+0000']) - self.assertEqual( - first_value, - 
x[Timestamp(np_datetime64_compat('2013-01-01 00:00:00.000000050+0000', - 'ns'))]) + exp_ts = np_datetime64_compat('2013-01-01 00:00:00.000000050+0000', + 'ns') + self.assertEqual(first_value, x[Timestamp(exp_ts)]) def test_comparators(self): index = self.dateIndex @@ -596,57 +679,56 @@ def test_union(self): first = Index(list('ab'), name='A') second = Index(list('ab'), name='B') union = first.union(second) - self.assertIsNone(union.name) + expected = Index(list('ab'), name=None) + tm.assert_index_equal(union, expected) first = Index(list('ab'), name='A') second = Index([], name='B') union = first.union(second) - self.assertIsNone(union.name) + expected = Index(list('ab'), name=None) + tm.assert_index_equal(union, expected) first = Index([], name='A') second = Index(list('ab'), name='B') union = first.union(second) - self.assertIsNone(union.name) + expected = Index(list('ab'), name=None) + tm.assert_index_equal(union, expected) first = Index(list('ab')) second = Index(list('ab'), name='B') union = first.union(second) - self.assertEqual(union.name, 'B') + expected = Index(list('ab'), name='B') + tm.assert_index_equal(union, expected) first = Index([]) second = Index(list('ab'), name='B') union = first.union(second) - self.assertEqual(union.name, 'B') + expected = Index(list('ab'), name='B') + tm.assert_index_equal(union, expected) first = Index(list('ab')) second = Index([], name='B') union = first.union(second) - self.assertEqual(union.name, 'B') + expected = Index(list('ab'), name='B') + tm.assert_index_equal(union, expected) first = Index(list('ab'), name='A') second = Index(list('ab')) union = first.union(second) - self.assertEqual(union.name, 'A') + expected = Index(list('ab'), name='A') + tm.assert_index_equal(union, expected) first = Index(list('ab'), name='A') second = Index([]) union = first.union(second) - self.assertEqual(union.name, 'A') + expected = Index(list('ab'), name='A') + tm.assert_index_equal(union, expected) first = Index([], name='A') second = Index(list('ab')) union = first.union(second) - self.assertEqual(union.name, 'A') - - def test_add(self): - - # - API change GH 8226 - with tm.assert_produces_warning(): - self.strIndex + self.strIndex - with tm.assert_produces_warning(): - self.strIndex + self.strIndex.tolist() - with tm.assert_produces_warning(): - self.strIndex.tolist() + self.strIndex + expected = Index(list('ab'), name='A') + tm.assert_index_equal(union, expected) with tm.assert_produces_warning(RuntimeWarning): firstCat = self.strIndex.union(self.dateIndex) @@ -663,6 +745,13 @@ def test_add(self): tm.assert_contains_all(self.strIndex, secondCat) tm.assert_contains_all(self.dateIndex, firstCat) + def test_add(self): + idx = self.strIndex + expected = Index(self.strIndex.values * 2) + self.assert_index_equal(idx + idx, expected) + self.assert_index_equal(idx + idx.tolist(), expected) + self.assert_index_equal(idx.tolist() + idx, expected) + # test add and radd idx = Index(list('abc')) expected = Index(['a1', 'b1', 'c1']) @@ -670,6 +759,13 @@ def test_add(self): expected = Index(['1a', '1b', '1c']) self.assert_index_equal('1' + idx, expected) + def test_sub(self): + idx = self.strIndex + self.assertRaises(TypeError, lambda: idx - 'a') + self.assertRaises(TypeError, lambda: idx - idx) + self.assertRaises(TypeError, lambda: idx - idx.tolist()) + self.assertRaises(TypeError, lambda: idx.tolist() - idx) + def test_append_multiple(self): index = Index(['a', 'b', 'c', 'd', 'e', 'f']) @@ -759,17 +855,19 @@ def test_symmetric_difference(self): 
self.assertTrue(tm.equalContents(result, expected)) # nans: - # GH #6444, sorting of nans. Make sure the number of nans is right - # and the correct non-nan values are there. punt on sorting. - idx1 = Index([1, 2, 3, np.nan]) + # GH 13514 change: {nan} - {nan} == {} + # (GH 6444, sorting of nans, is no longer an issue) + idx1 = Index([1, np.nan, 2, 3]) idx2 = Index([0, 1, np.nan]) + idx3 = Index([0, 1]) + result = idx1.symmetric_difference(idx2) - # expected = Index([0.0, np.nan, 2.0, 3.0, np.nan]) + expected = Index([0.0, 2.0, 3.0]) + tm.assert_index_equal(result, expected) - nans = pd.isnull(result) - self.assertEqual(nans.sum(), 1) - self.assertEqual((~nans).sum(), 3) - [self.assertIn(x, result) for x in [0.0, 2.0, 3.0]] + result = idx1.symmetric_difference(idx3) + expected = Index([0.0, 2.0, 3.0, np.nan]) + tm.assert_index_equal(result, expected) # other not an Index: idx1 = Index([1, 2, 3, 4], name='idx1') @@ -881,10 +979,10 @@ def test_get_indexer(self): idx2 = Index([2, 4, 6]) r1 = idx1.get_indexer(idx2) - assert_almost_equal(r1, np.array([1, 3, -1])) + assert_almost_equal(r1, np.array([1, 3, -1], dtype=np.intp)) r1 = idx2.get_indexer(idx1, method='pad') - e1 = np.array([-1, 0, 0, 1, 1]) + e1 = np.array([-1, 0, 0, 1, 1], dtype=np.intp) assert_almost_equal(r1, e1) r2 = idx2.get_indexer(idx1[::-1], method='pad') @@ -894,7 +992,7 @@ def test_get_indexer(self): assert_almost_equal(r1, rffill1) r1 = idx2.get_indexer(idx1, method='backfill') - e1 = np.array([0, 0, 1, 1, 2]) + e1 = np.array([0, 0, 1, 1, 2], dtype=np.intp) assert_almost_equal(r1, e1) rbfill1 = idx2.get_indexer(idx1, method='bfill') @@ -919,25 +1017,30 @@ def test_get_indexer_nearest(self): all_methods = ['pad', 'backfill', 'nearest'] for method in all_methods: actual = idx.get_indexer([0, 5, 9], method=method) - tm.assert_numpy_array_equal(actual, np.array([0, 5, 9])) + tm.assert_numpy_array_equal(actual, np.array([0, 5, 9], + dtype=np.intp)) actual = idx.get_indexer([0, 5, 9], method=method, tolerance=0) - tm.assert_numpy_array_equal(actual, np.array([0, 5, 9])) + tm.assert_numpy_array_equal(actual, np.array([0, 5, 9], + dtype=np.intp)) for method, expected in zip(all_methods, [[0, 1, 8], [1, 2, 9], [0, 2, 9]]): actual = idx.get_indexer([0.2, 1.8, 8.5], method=method) - tm.assert_numpy_array_equal(actual, np.array(expected)) + tm.assert_numpy_array_equal(actual, np.array(expected, + dtype=np.intp)) actual = idx.get_indexer([0.2, 1.8, 8.5], method=method, tolerance=1) - tm.assert_numpy_array_equal(actual, np.array(expected)) + tm.assert_numpy_array_equal(actual, np.array(expected, + dtype=np.intp)) for method, expected in zip(all_methods, [[0, -1, -1], [-1, 2, -1], [0, 2, -1]]): actual = idx.get_indexer([0.2, 1.8, 8.5], method=method, tolerance=0.2) - tm.assert_numpy_array_equal(actual, np.array(expected)) + tm.assert_numpy_array_equal(actual, np.array(expected, + dtype=np.intp)) with tm.assertRaisesRegexp(ValueError, 'limit argument'): idx.get_indexer([1, 0], method='nearest', limit=1) @@ -948,22 +1051,24 @@ def test_get_indexer_nearest_decreasing(self): all_methods = ['pad', 'backfill', 'nearest'] for method in all_methods: actual = idx.get_indexer([0, 5, 9], method=method) - tm.assert_numpy_array_equal(actual, np.array([9, 4, 0])) + tm.assert_numpy_array_equal(actual, np.array([9, 4, 0], + dtype=np.intp)) for method, expected in zip(all_methods, [[8, 7, 0], [9, 8, 1], [9, 7, 0]]): actual = idx.get_indexer([0.2, 1.8, 8.5], method=method) - tm.assert_numpy_array_equal(actual, np.array(expected)) + 
tm.assert_numpy_array_equal(actual, np.array(expected, + dtype=np.intp)) def test_get_indexer_strings(self): idx = pd.Index(['b', 'c']) actual = idx.get_indexer(['a', 'b', 'c', 'd'], method='pad') - expected = np.array([-1, 0, 1, 1]) + expected = np.array([-1, 0, 1, 1], dtype=np.intp) tm.assert_numpy_array_equal(actual, expected) actual = idx.get_indexer(['a', 'b', 'c', 'd'], method='backfill') - expected = np.array([0, 0, 1, -1]) + expected = np.array([0, 0, 1, -1], dtype=np.intp) tm.assert_numpy_array_equal(actual, expected) with tm.assertRaises(TypeError): @@ -1369,6 +1474,12 @@ def test_take_fill_value(self): with tm.assertRaises(IndexError): idx.take(np.array([1, -5])) + def test_reshape_raise(self): + msg = "reshaping is not supported" + idx = pd.Index([0, 1, 2]) + tm.assertRaisesRegexp(NotImplementedError, msg, + idx.reshape, idx.shape) + def test_reindex_preserves_name_if_target_is_list_or_ndarray(self): # GH6552 idx = pd.Index([0, 1, 2]) @@ -1524,41 +1635,47 @@ def test_string_index_repr(self): expected = u"""Index(['あ', 'いい', 'ううう'], dtype='object')""" self.assertEqual(repr(idx), expected) else: - expected = u"""\ -Index([u'あ', u'いい', u'ううう'], dtype='object')""" + expected = u"""Index([u'あ', u'いい', u'ううう'], dtype='object')""" self.assertEqual(coerce(idx), expected) # multiple lines idx = pd.Index([u'あ', u'いい', u'ううう'] * 10) if PY3: - expected = u"""Index(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', - 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', - 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], - dtype='object')""" - + expected = (u"Index(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', " + u"'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう',\n" + u" 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', " + u"'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう',\n" + u" 'あ', 'いい', 'ううう', 'あ', 'いい', " + u"'ううう'],\n" + u" dtype='object')") self.assertEqual(repr(idx), expected) else: - expected = u"""Index([u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', - u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', - u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう'], - dtype='object')""" - + expected = (u"Index([u'あ', u'いい', u'ううう', u'あ', u'いい', " + u"u'ううう', u'あ', u'いい', u'ううう', u'あ',\n" + u" u'いい', u'ううう', u'あ', u'いい', u'ううう', " + u"u'あ', u'いい', u'ううう', u'あ', u'いい',\n" + u" u'ううう', u'あ', u'いい', u'ううう', u'あ', " + u"u'いい', u'ううう', u'あ', u'いい', u'ううう'],\n" + u" dtype='object')") self.assertEqual(coerce(idx), expected) # truncated idx = pd.Index([u'あ', u'いい', u'ううう'] * 100) if PY3: - expected = u"""Index(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', - ... - 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], - dtype='object', length=300)""" - + expected = (u"Index(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', " + u"'あ', 'いい', 'ううう', 'あ',\n" + u" ...\n" + u" 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', " + u"'ううう', 'あ', 'いい', 'ううう'],\n" + u" dtype='object', length=300)") self.assertEqual(repr(idx), expected) else: - expected = u"""Index([u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', - ... 
- u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう'], - dtype='object', length=300)""" + expected = (u"Index([u'あ', u'いい', u'ううう', u'あ', u'いい', " + u"u'ううう', u'あ', u'いい', u'ううう', u'あ',\n" + u" ...\n" + u" u'ううう', u'あ', u'いい', u'ううう', u'あ', " + u"u'いい', u'ううう', u'あ', u'いい', u'ううう'],\n" + u" dtype='object', length=300)") self.assertEqual(coerce(idx), expected) @@ -1568,53 +1685,250 @@ def test_string_index_repr(self): # short idx = pd.Index([u'あ', u'いい', u'ううう']) if PY3: - expected = u"""Index(['あ', 'いい', 'ううう'], dtype='object')""" + expected = (u"Index(['あ', 'いい', 'ううう'], " + u"dtype='object')") self.assertEqual(repr(idx), expected) else: - expected = u"""Index([u'あ', u'いい', u'ううう'], dtype='object')""" + expected = (u"Index([u'あ', u'いい', u'ううう'], " + u"dtype='object')") self.assertEqual(coerce(idx), expected) # multiple lines idx = pd.Index([u'あ', u'いい', u'ううう'] * 10) if PY3: - expected = u"""Index(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', - 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', - 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', - 'あ', 'いい', 'ううう'], - dtype='object')""" + expected = (u"Index(['あ', 'いい', 'ううう', 'あ', 'いい', " + u"'ううう', 'あ', 'いい', 'ううう',\n" + u" 'あ', 'いい', 'ううう', 'あ', 'いい', " + u"'ううう', 'あ', 'いい', 'ううう',\n" + u" 'あ', 'いい', 'ううう', 'あ', 'いい', " + u"'ううう', 'あ', 'いい', 'ううう',\n" + u" 'あ', 'いい', 'ううう'],\n" + u" dtype='object')""") self.assertEqual(repr(idx), expected) else: - expected = u"""Index([u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', - u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', - u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', - u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう'], - dtype='object')""" + expected = (u"Index([u'あ', u'いい', u'ううう', u'あ', u'いい', " + u"u'ううう', u'あ', u'いい',\n" + u" u'ううう', u'あ', u'いい', u'ううう', " + u"u'あ', u'いい', u'ううう', u'あ',\n" + u" u'いい', u'ううう', u'あ', u'いい', " + u"u'ううう', u'あ', u'いい',\n" + u" u'ううう', u'あ', u'いい', u'ううう', " + u"u'あ', u'いい', u'ううう'],\n" + u" dtype='object')") self.assertEqual(coerce(idx), expected) # truncated idx = pd.Index([u'あ', u'いい', u'ううう'] * 100) if PY3: - expected = u"""Index(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', - 'あ', - ... - 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', - 'ううう'], - dtype='object', length=300)""" + expected = (u"Index(['あ', 'いい', 'ううう', 'あ', 'いい', " + u"'ううう', 'あ', 'いい', 'ううう',\n" + u" 'あ',\n" + u" ...\n" + u" 'ううう', 'あ', 'いい', 'ううう', 'あ', " + u"'いい', 'ううう', 'あ', 'いい',\n" + u" 'ううう'],\n" + u" dtype='object', length=300)") self.assertEqual(repr(idx), expected) else: - expected = u"""Index([u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', - u'ううう', u'あ', - ... 
- u'ううう', u'あ', u'いい', u'ううう', u'あ', u'いい', u'ううう', u'あ', - u'いい', u'ううう'], - dtype='object', length=300)""" + expected = (u"Index([u'あ', u'いい', u'ううう', u'あ', u'いい', " + u"u'ううう', u'あ', u'いい',\n" + u" u'ううう', u'あ',\n" + u" ...\n" + u" u'ううう', u'あ', u'いい', u'ううう', " + u"u'あ', u'いい', u'ううう', u'あ',\n" + u" u'いい', u'ううう'],\n" + u" dtype='object', length=300)") self.assertEqual(coerce(idx), expected) +class TestMixedIntIndex(Base, tm.TestCase): + # Mostly the tests from common.py for which the results differ + # in py2 and py3 because ints and strings are uncomparable in py3 + # (GH 13514) + + _holder = Index + _multiprocess_can_split_ = True + + def setUp(self): + self.indices = dict(mixedIndex=Index([0, 'a', 1, 'b', 2, 'c'])) + self.setup_indices() + + def create_index(self): + return self.mixedIndex + + def test_order(self): + idx = self.create_index() + # 9816 deprecated + if PY3: + with tm.assertRaisesRegexp(TypeError, "unorderable types"): + with tm.assert_produces_warning(FutureWarning): + idx.order() + else: + with tm.assert_produces_warning(FutureWarning): + idx.order() + + def test_argsort(self): + idx = self.create_index() + if PY3: + with tm.assertRaisesRegexp(TypeError, "unorderable types"): + result = idx.argsort() + else: + result = idx.argsort() + expected = np.array(idx).argsort() + tm.assert_numpy_array_equal(result, expected, check_dtype=False) + + def test_numpy_argsort(self): + idx = self.create_index() + if PY3: + with tm.assertRaisesRegexp(TypeError, "unorderable types"): + result = np.argsort(idx) + else: + result = np.argsort(idx) + expected = idx.argsort() + tm.assert_numpy_array_equal(result, expected) + + def test_copy_name(self): + # Check that "name" argument passed at initialization is honoured + # GH12309 + idx = self.create_index() + + first = idx.__class__(idx, copy=True, name='mario') + second = first.__class__(first, copy=False) + + # Even though "copy=False", we want a new object. 
+ self.assertIsNot(first, second) + # Not using tm.assert_index_equal() since names differ: + self.assertTrue(idx.equals(first)) + + self.assertEqual(first.name, 'mario') + self.assertEqual(second.name, 'mario') + + s1 = Series(2, index=first) + s2 = Series(3, index=second[:-1]) + if PY3: + with tm.assert_produces_warning(RuntimeWarning): + # unorderable types + s3 = s1 * s2 + else: + s3 = s1 * s2 + self.assertEqual(s3.index.name, 'mario') + + def test_union_base(self): + idx = self.create_index() + first = idx[3:] + second = idx[:5] + + if PY3: + with tm.assert_produces_warning(RuntimeWarning): + # unorderable types + result = first.union(second) + expected = Index(['b', 2, 'c', 0, 'a', 1]) + self.assert_index_equal(result, expected) + else: + result = first.union(second) + expected = Index(['b', 2, 'c', 0, 'a', 1]) + self.assert_index_equal(result, expected) + + # GH 10149 + cases = [klass(second.values) + for klass in [np.array, Series, list]] + for case in cases: + if PY3: + with tm.assert_produces_warning(RuntimeWarning): + # unorderable types + result = first.union(case) + self.assertTrue(tm.equalContents(result, idx)) + else: + result = first.union(case) + self.assertTrue(tm.equalContents(result, idx)) + + def test_intersection_base(self): + # (same results for py2 and py3 but sortedness not tested elsewhere) + idx = self.create_index() + first = idx[:5] + second = idx[:3] + result = first.intersection(second) + expected = Index([0, 'a', 1]) + self.assert_index_equal(result, expected) + + # GH 10149 + cases = [klass(second.values) + for klass in [np.array, Series, list]] + for case in cases: + result = first.intersection(case) + self.assertTrue(tm.equalContents(result, second)) + + def test_difference_base(self): + # (same results for py2 and py3 but sortedness not tested elsewhere) + idx = self.create_index() + first = idx[:4] + second = idx[3:] + + result = first.difference(second) + expected = Index([0, 1, 'a']) + self.assert_index_equal(result, expected) + + def test_symmetric_difference(self): + # (same results for py2 and py3 but sortedness not tested elsewhere) + idx = self.create_index() + first = idx[:4] + second = idx[3:] + + result = first.symmetric_difference(second) + expected = Index([0, 1, 2, 'a', 'c']) + self.assert_index_equal(result, expected) + + def test_logical_compat(self): + idx = self.create_index() + self.assertEqual(idx.all(), idx.values.all()) + self.assertEqual(idx.any(), idx.values.any()) + + def test_dropna(self): + # GH 6194 + for dtype in [None, object, 'category']: + idx = pd.Index([1, 2, 3], dtype=dtype) + tm.assert_index_equal(idx.dropna(), idx) + + idx = pd.Index([1., 2., 3.], dtype=dtype) + tm.assert_index_equal(idx.dropna(), idx) + nanidx = pd.Index([1., 2., np.nan, 3.], dtype=dtype) + tm.assert_index_equal(nanidx.dropna(), idx) + + idx = pd.Index(['A', 'B', 'C'], dtype=dtype) + tm.assert_index_equal(idx.dropna(), idx) + nanidx = pd.Index(['A', np.nan, 'B', 'C'], dtype=dtype) + tm.assert_index_equal(nanidx.dropna(), idx) + + tm.assert_index_equal(nanidx.dropna(how='any'), idx) + tm.assert_index_equal(nanidx.dropna(how='all'), idx) + + idx = pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03']) + tm.assert_index_equal(idx.dropna(), idx) + nanidx = pd.DatetimeIndex(['2011-01-01', '2011-01-02', + '2011-01-03', pd.NaT]) + tm.assert_index_equal(nanidx.dropna(), idx) + + idx = pd.TimedeltaIndex(['1 days', '2 days', '3 days']) + tm.assert_index_equal(idx.dropna(), idx) + nanidx = pd.TimedeltaIndex([pd.NaT, '1 days', '2 days', + '3 days', 
pd.NaT]) + tm.assert_index_equal(nanidx.dropna(), idx) + + idx = pd.PeriodIndex(['2012-02', '2012-04', '2012-05'], freq='M') + tm.assert_index_equal(idx.dropna(), idx) + nanidx = pd.PeriodIndex(['2012-02', '2012-04', 'NaT', '2012-05'], + freq='M') + tm.assert_index_equal(nanidx.dropna(), idx) + + msg = "invalid how option: xxx" + with tm.assertRaisesRegexp(ValueError, msg): + pd.Index([1, 2, 3]).dropna(how='xxx') + + def test_get_combined_index(): from pandas.core.index import _get_combined_index result = _get_combined_index([]) diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index c64b1e9fc4af8..9f8405bcc2e1e 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -271,12 +271,12 @@ def test_append(self): lambda: ci.append(ci.values.reorder_categories(list('abc')))) # with objects - result = ci.append(['c', 'a']) + result = ci.append(Index(['c', 'a'])) expected = CategoricalIndex(list('aabbcaca'), categories=categories) tm.assert_index_equal(result, expected, exact=True) # invalid objects - self.assertRaises(TypeError, lambda: ci.append(['a', 'd'])) + self.assertRaises(TypeError, lambda: ci.append(Index(['a', 'd']))) def test_insert(self): @@ -336,7 +336,7 @@ def test_reindex_base(self): # determined by cat ordering idx = self.create_index() - expected = np.array([4, 0, 1, 5, 2, 3]) + expected = np.array([4, 0, 1, 5, 2, 3], dtype=np.intp) actual = idx.get_indexer(idx) tm.assert_numpy_array_equal(expected, actual) @@ -395,6 +395,7 @@ def test_duplicates(self): expected = CategoricalIndex([0], name='foo') self.assert_index_equal(idx.drop_duplicates(), expected) + self.assert_index_equal(idx.unique(), expected) def test_get_indexer(self): @@ -403,7 +404,7 @@ def test_get_indexer(self): for indexer in [idx2, list('abf'), Index(list('abf'))]: r1 = idx1.get_indexer(idx2) - assert_almost_equal(r1, np.array([0, 1, 2, -1])) + assert_almost_equal(r1, np.array([0, 1, 2, -1], dtype=np.intp)) self.assertRaises(NotImplementedError, lambda: idx2.get_indexer(idx1, method='pad')) @@ -507,7 +508,21 @@ def test_identical(self): self.assertTrue(ci1.identical(ci1.copy())) self.assertFalse(ci1.identical(ci2)) - def test_equals(self): + def test_ensure_copied_data(self): + # Check the "copy" argument of each Index.__new__ is honoured + # GH12309 + # Must be tested separately from other indexes because + # self.value is not an ndarray + _base = lambda ar : ar if ar.base is None else ar.base + for index in self.indices.values(): + result = CategoricalIndex(index.values, copy=True) + tm.assert_index_equal(index, result) + self.assertIsNot(_base(index.values), _base(result.values)) + + result = CategoricalIndex(index.values, copy=False) + self.assertIs(_base(index.values), _base(result.values)) + + def test_equals_categorical(self): ci1 = CategoricalIndex(['a', 'b'], categories=['a', 'b'], ordered=True) ci2 = CategoricalIndex(['a', 'b'], categories=['a', 'b', 'c'], @@ -541,19 +556,30 @@ def test_equals(self): # tests # make sure that we are testing for category inclusion properly - self.assertTrue(CategoricalIndex( - list('aabca'), categories=['c', 'a', 'b']).equals(list('aabca'))) + ci = CategoricalIndex(list('aabca'), categories=['c', 'a', 'b']) + self.assertFalse(ci.equals(list('aabca'))) + self.assertFalse(ci.equals(CategoricalIndex(list('aabca')))) + self.assertTrue(ci.equals(ci.copy())) + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + ci = CategoricalIndex(list('aabca'), + categories=['c', 'a', 
'b', np.nan]) + self.assertFalse(ci.equals(list('aabca'))) + self.assertFalse(ci.equals(CategoricalIndex(list('aabca')))) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - self.assertTrue(CategoricalIndex( - list('aabca'), categories=['c', 'a', 'b', np.nan]).equals(list( - 'aabca'))) - - self.assertFalse(CategoricalIndex( - list('aabca') + [np.nan], categories=['c', 'a', 'b']).equals(list( - 'aabca'))) - self.assertTrue(CategoricalIndex( - list('aabca') + [np.nan], categories=['c', 'a', 'b']).equals(list( - 'aabca') + [np.nan])) + self.assertTrue(ci.equals(ci.copy())) + + ci = CategoricalIndex(list('aabca') + [np.nan], + categories=['c', 'a', 'b']) + self.assertFalse(ci.equals(list('aabca'))) + self.assertFalse(ci.equals(CategoricalIndex(list('aabca')))) + self.assertTrue(ci.equals(ci.copy())) + + ci = CategoricalIndex(list('aabca') + [np.nan], + categories=['c', 'a', 'b']) + self.assertFalse(ci.equals(list('aabca') + [np.nan])) + self.assertFalse(ci.equals(CategoricalIndex(list('aabca') + [np.nan]))) + self.assertTrue(ci.equals(ci.copy())) def test_string_categorical_index_repr(self): # short diff --git a/pandas/tests/indexes/test_datetimelike.py b/pandas/tests/indexes/test_datetimelike.py index 4a664ed3542d7..7502a4ce26b04 100644 --- a/pandas/tests/indexes/test_datetimelike.py +++ b/pandas/tests/indexes/test_datetimelike.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -from datetime import timedelta, time +from datetime import datetime, timedelta, time import numpy as np @@ -12,7 +12,7 @@ import pandas.util.testing as tm import pandas as pd -from pandas.lib import Timestamp +from pandas.tslib import Timestamp, OutOfBoundsDatetime from .common import Base @@ -119,10 +119,10 @@ def test_pickle_compat_construction(self): def test_construction_index_with_mixed_timezones(self): # GH 11488 # no tz results in DatetimeIndex - result = Index( - [Timestamp('2011-01-01'), Timestamp('2011-01-02')], name='idx') - exp = DatetimeIndex( - [Timestamp('2011-01-01'), Timestamp('2011-01-02')], name='idx') + result = Index([Timestamp('2011-01-01'), + Timestamp('2011-01-02')], name='idx') + exp = DatetimeIndex([Timestamp('2011-01-01'), + Timestamp('2011-01-02')], name='idx') self.assert_index_equal(result, exp, exact=True) self.assertTrue(isinstance(result, DatetimeIndex)) self.assertIsNone(result.tz) @@ -170,16 +170,6 @@ def test_construction_index_with_mixed_timezones(self): self.assert_index_equal(result, exp, exact=True) self.assertFalse(isinstance(result, DatetimeIndex)) - # passing tz results in DatetimeIndex - result = Index([Timestamp('2011-01-01 10:00'), - Timestamp('2011-01-02 10:00', tz='US/Eastern')], - tz='Asia/Tokyo', name='idx') - exp = DatetimeIndex([Timestamp('2011-01-01 19:00'), - Timestamp('2011-01-03 00:00')], - tz='Asia/Tokyo', name='idx') - self.assert_index_equal(result, exp, exact=True) - self.assertTrue(isinstance(result, DatetimeIndex)) - # length = 1 result = Index([Timestamp('2011-01-01')], name='idx') exp = DatetimeIndex([Timestamp('2011-01-01')], name='idx') @@ -253,17 +243,6 @@ def test_construction_index_with_mixed_timezones_with_NaT(self): self.assert_index_equal(result, exp, exact=True) self.assertFalse(isinstance(result, DatetimeIndex)) - # passing tz results in DatetimeIndex - result = Index([pd.NaT, Timestamp('2011-01-01 10:00'), - pd.NaT, Timestamp('2011-01-02 10:00', - tz='US/Eastern')], - tz='Asia/Tokyo', name='idx') - exp = DatetimeIndex([pd.NaT, Timestamp('2011-01-01 19:00'), - pd.NaT, Timestamp('2011-01-03 00:00')], - tz='Asia/Tokyo', name='idx') - 
self.assert_index_equal(result, exp, exact=True) - self.assertTrue(isinstance(result, DatetimeIndex)) - # all NaT result = Index([pd.NaT, pd.NaT], name='idx') exp = DatetimeIndex([pd.NaT, pd.NaT], name='idx') @@ -295,9 +274,9 @@ def test_construction_dti_with_mixed_timezones(self): Timestamp('2011-01-02 10:00', tz='Asia/Tokyo')], name='idx') - exp = DatetimeIndex( - [Timestamp('2011-01-01 10:00'), Timestamp('2011-01-02 10:00') - ], tz='Asia/Tokyo', name='idx') + exp = DatetimeIndex([Timestamp('2011-01-01 10:00'), + Timestamp('2011-01-02 10:00')], + tz='Asia/Tokyo', name='idx') self.assert_index_equal(result, exp, exact=True) self.assertTrue(isinstance(result, DatetimeIndex)) @@ -323,12 +302,13 @@ def test_construction_dti_with_mixed_timezones(self): self.assertTrue(isinstance(result, DatetimeIndex)) # tz mismatch affecting to tz-aware raises TypeError/ValueError + with tm.assertRaises(ValueError): DatetimeIndex([Timestamp('2011-01-01 10:00', tz='Asia/Tokyo'), Timestamp('2011-01-02 10:00', tz='US/Eastern')], name='idx') - with tm.assertRaises(TypeError): + with tm.assertRaisesRegexp(TypeError, 'data is already tz-aware'): DatetimeIndex([Timestamp('2011-01-01 10:00'), Timestamp('2011-01-02 10:00', tz='US/Eastern')], tz='Asia/Tokyo', name='idx') @@ -338,6 +318,36 @@ def test_construction_dti_with_mixed_timezones(self): Timestamp('2011-01-02 10:00', tz='US/Eastern')], tz='US/Eastern', name='idx') + with tm.assertRaisesRegexp(TypeError, 'data is already tz-aware'): + # passing tz should results in DatetimeIndex, then mismatch raises + # TypeError + Index([pd.NaT, Timestamp('2011-01-01 10:00'), + pd.NaT, Timestamp('2011-01-02 10:00', tz='US/Eastern')], + tz='Asia/Tokyo', name='idx') + + def test_construction_base_constructor(self): + arr = [pd.Timestamp('2011-01-01'), pd.NaT, pd.Timestamp('2011-01-03')] + tm.assert_index_equal(pd.Index(arr), pd.DatetimeIndex(arr)) + tm.assert_index_equal(pd.Index(np.array(arr)), + pd.DatetimeIndex(np.array(arr))) + + arr = [np.nan, pd.NaT, pd.Timestamp('2011-01-03')] + tm.assert_index_equal(pd.Index(arr), pd.DatetimeIndex(arr)) + tm.assert_index_equal(pd.Index(np.array(arr)), + pd.DatetimeIndex(np.array(arr))) + + def test_construction_outofbounds(self): + # GH 13663 + dates = [datetime(3000, 1, 1), datetime(4000, 1, 1), + datetime(5000, 1, 1), datetime(6000, 1, 1)] + exp = Index(dates, dtype=object) + # coerces to object + tm.assert_index_equal(Index(dates), exp) + + with tm.assertRaises(OutOfBoundsDatetime): + # can't create DatetimeIndex + DatetimeIndex(dates) + def test_astype(self): # GH 13149, GH 13209 idx = DatetimeIndex(['2016-05-16', 'NaT', NaT, np.NaN]) @@ -534,28 +544,29 @@ def test_get_loc(self): # time indexing idx = pd.date_range('2000-01-01', periods=24, freq='H') tm.assert_numpy_array_equal(idx.get_loc(time(12)), - np.array([12], dtype=np.int64)) + np.array([12]), check_dtype=False) tm.assert_numpy_array_equal(idx.get_loc(time(12, 30)), - np.array([], dtype=np.int64)) + np.array([]), check_dtype=False) with tm.assertRaises(NotImplementedError): idx.get_loc(time(12, 30), method='pad') def test_get_indexer(self): idx = pd.date_range('2000-01-01', periods=3) - tm.assert_numpy_array_equal(idx.get_indexer(idx), np.array([0, 1, 2])) + exp = np.array([0, 1, 2], dtype=np.intp) + tm.assert_numpy_array_equal(idx.get_indexer(idx), exp) target = idx[0] + pd.to_timedelta(['-1 hour', '12 hours', '1 day 1 hour']) tm.assert_numpy_array_equal(idx.get_indexer(target, 'pad'), - np.array([-1, 0, 1])) + np.array([-1, 0, 1], dtype=np.intp)) 
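# ---- Illustrative aside (not part of the patch) ----------------------------
# The surrounding get_indexer assertions (here and in the numeric, range and
# multi-index tests further below) all pin the result dtype to np.intp, the
# integer type numpy itself uses for indexing, instead of a platform-dependent
# np.int_ / np.int64. A minimal sketch of what is being asserted, assuming a
# pandas build that includes this change:
import numpy as np
import pandas as pd

idx = pd.date_range('2000-01-01', periods=3)
indexer = idx.get_indexer(idx)
assert indexer.dtype == np.intp          # not whatever the platform default is
assert list(indexer) == [0, 1, 2]
# ---- end aside --------------------------------------------------------------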
tm.assert_numpy_array_equal(idx.get_indexer(target, 'backfill'), - np.array([0, 1, 2])) + np.array([0, 1, 2], dtype=np.intp)) tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest'), - np.array([0, 1, 1])) + np.array([0, 1, 1], dtype=np.intp)) tm.assert_numpy_array_equal( idx.get_indexer(target, 'nearest', tolerance=pd.Timedelta('1 hour')), - np.array([0, -1, 1])) + np.array([0, -1, 1], dtype=np.intp)) with tm.assertRaises(ValueError): idx.get_indexer(idx[[0]], method='nearest', tolerance='foo') @@ -587,7 +598,8 @@ def test_time_loc(self): # GH8667 ts = pd.Series(np.random.randn(n), index=idx) i = np.arange(start, n, step) - tm.assert_numpy_array_equal(ts.index.get_loc(key), i) + tm.assert_numpy_array_equal(ts.index.get_loc(key), i, + check_dtype=False) tm.assert_series_equal(ts[key], ts.iloc[i]) left, right = ts.copy(), ts.copy() @@ -698,12 +710,11 @@ def test_fillna_datetime64(self): pd.Timestamp('2011-01-01 11:00')], dtype=object) self.assert_index_equal(idx.fillna('x'), exp) - idx = pd.DatetimeIndex( - ['2011-01-01 09:00', pd.NaT, '2011-01-01 11:00'], tz=tz) + idx = pd.DatetimeIndex(['2011-01-01 09:00', pd.NaT, + '2011-01-01 11:00'], tz=tz) - exp = pd.DatetimeIndex( - ['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00' - ], tz=tz) + exp = pd.DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', + '2011-01-01 11:00'], tz=tz) self.assert_index_equal( idx.fillna(pd.Timestamp('2011-01-01 10:00', tz=tz)), exp) @@ -733,6 +744,26 @@ def setUp(self): def create_index(self): return period_range('20130101', periods=5, freq='D') + def test_construction_base_constructor(self): + # GH 13664 + arr = [pd.Period('2011-01', freq='M'), pd.NaT, + pd.Period('2011-03', freq='M')] + tm.assert_index_equal(pd.Index(arr), pd.PeriodIndex(arr)) + tm.assert_index_equal(pd.Index(np.array(arr)), + pd.PeriodIndex(np.array(arr))) + + arr = [np.nan, pd.NaT, pd.Period('2011-03', freq='M')] + tm.assert_index_equal(pd.Index(arr), pd.PeriodIndex(arr)) + tm.assert_index_equal(pd.Index(np.array(arr)), + pd.PeriodIndex(np.array(arr))) + + arr = [pd.Period('2011-01', freq='M'), pd.NaT, + pd.Period('2011-03', freq='D')] + tm.assert_index_equal(pd.Index(arr), pd.Index(arr, dtype=object)) + + tm.assert_index_equal(pd.Index(np.array(arr)), + pd.Index(np.array(arr), dtype=object)) + def test_astype(self): # GH 13149, GH 13209 idx = PeriodIndex(['2016-05-16', 'NaT', NaT, np.NaN], freq='D') @@ -740,14 +771,7 @@ def test_astype(self): result = idx.astype(object) expected = Index([Period('2016-05-16', freq='D')] + [Period(NaT, freq='D')] * 3, dtype='object') - # Hack because of lack of support for Period null checking (GH12759) - tm.assert_index_equal(result[:1], expected[:1]) - result_arr = np.asarray([p.ordinal for p in result], dtype=np.int64) - expected_arr = np.asarray([p.ordinal for p in expected], - dtype=np.int64) - tm.assert_numpy_array_equal(result_arr, expected_arr) - # TODO: When GH12759 is resolved, change the above hack to: - # tm.assert_index_equal(result, expected) # now, it raises. 
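# ---- Illustrative aside (not part of the patch) ----------------------------
# The ordinal-comparison hack removed above existed because Period had no null
# checking (GH 12759); the line added just below compares the object-dtype
# result directly. A minimal sketch of the behaviour now being asserted,
# assuming a pandas build with GH 12759 resolved:
import pandas as pd
import pandas.util.testing as tm
from pandas import Index, NaT, Period, PeriodIndex

idx = PeriodIndex(['2016-05-16', 'NaT'], freq='D')
result = idx.astype(object)
expected = Index([Period('2016-05-16', freq='D'), Period(NaT, freq='D')],
                 dtype='object')
tm.assert_index_equal(result, expected)   # NaT periods now compare cleanly
# ---- end aside --------------------------------------------------------------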
+ tm.assert_index_equal(result, expected) result = idx.astype(int) expected = Int64Index([16937] + [-9223372036854775808] * 3, @@ -757,7 +781,7 @@ def test_astype(self): idx = period_range('1990', '2009', freq='A') result = idx.astype('i8') self.assert_index_equal(result, Index(idx.asi8)) - self.assert_numpy_array_equal(result.values, idx.values) + self.assert_numpy_array_equal(result.values, idx.asi8) def test_astype_raises(self): # GH 13149, GH 13209 @@ -767,8 +791,6 @@ def test_astype_raises(self): self.assertRaises(ValueError, idx.astype, float) self.assertRaises(ValueError, idx.astype, 'timedelta64') self.assertRaises(ValueError, idx.astype, 'timedelta64[ns]') - self.assertRaises(ValueError, idx.astype, 'datetime64') - self.assertRaises(ValueError, idx.astype, 'datetime64[ns]') def test_shift(self): @@ -849,19 +871,19 @@ def test_where_other(self): def test_get_indexer(self): idx = pd.period_range('2000-01-01', periods=3).asfreq('H', how='start') tm.assert_numpy_array_equal(idx.get_indexer(idx), - np.array([0, 1, 2], dtype=np.int_)) + np.array([0, 1, 2], dtype=np.intp)) target = pd.PeriodIndex(['1999-12-31T23', '2000-01-01T12', '2000-01-02T01'], freq='H') tm.assert_numpy_array_equal(idx.get_indexer(target, 'pad'), - np.array([-1, 0, 1], dtype=np.int_)) + np.array([-1, 0, 1], dtype=np.intp)) tm.assert_numpy_array_equal(idx.get_indexer(target, 'backfill'), - np.array([0, 1, 2], dtype=np.int_)) + np.array([0, 1, 2], dtype=np.intp)) tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest'), - np.array([0, 1, 1], dtype=np.int_)) + np.array([0, 1, 1], dtype=np.intp)) tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest', tolerance='1 hour'), - np.array([0, -1, 1], dtype=np.int_)) + np.array([0, -1, 1], dtype=np.intp)) msg = 'Input has different freq from PeriodIndex\\(freq=H\\)' with self.assertRaisesRegexp(ValueError, msg): @@ -869,7 +891,7 @@ def test_get_indexer(self): tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest', tolerance='1 day'), - np.array([0, 1, 1], dtype=np.int_)) + np.array([0, 1, 1], dtype=np.intp)) def test_repeat(self): # GH10183 @@ -880,7 +902,6 @@ def test_repeat(self): self.assertEqual(res.freqstr, 'D') def test_period_index_indexer(self): - # GH4125 idx = pd.period_range('2002-01', '2003-12', freq='M') df = pd.DataFrame(pd.np.random.randn(24, 10), index=idx) @@ -892,12 +913,11 @@ def test_period_index_indexer(self): def test_fillna_period(self): # GH 11343 - idx = pd.PeriodIndex( - ['2011-01-01 09:00', pd.NaT, '2011-01-01 11:00'], freq='H') + idx = pd.PeriodIndex(['2011-01-01 09:00', pd.NaT, + '2011-01-01 11:00'], freq='H') - exp = pd.PeriodIndex( - ['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00' - ], freq='H') + exp = pd.PeriodIndex(['2011-01-01 09:00', '2011-01-01 10:00', + '2011-01-01 11:00'], freq='H') self.assert_index_equal( idx.fillna(pd.Period('2011-01-01 10:00', freq='H')), exp) @@ -905,10 +925,11 @@ def test_fillna_period(self): pd.Period('2011-01-01 11:00', freq='H')], dtype=object) self.assert_index_equal(idx.fillna('x'), exp) - with tm.assertRaisesRegexp( - ValueError, - 'Input has different freq=D from PeriodIndex\\(freq=H\\)'): - idx.fillna(pd.Period('2011-01-01', freq='D')) + exp = pd.Index([pd.Period('2011-01-01 09:00', freq='H'), + pd.Period('2011-01-01', freq='D'), + pd.Period('2011-01-01 11:00', freq='H')], dtype=object) + self.assert_index_equal(idx.fillna(pd.Period('2011-01-01', freq='D')), + exp) def test_no_millisecond_field(self): with self.assertRaises(AttributeError): @@ -929,6 +950,17 @@ def setUp(self): 
def create_index(self): return pd.to_timedelta(range(5), unit='d') + pd.offsets.Hour(1) + def test_construction_base_constructor(self): + arr = [pd.Timedelta('1 days'), pd.NaT, pd.Timedelta('3 days')] + tm.assert_index_equal(pd.Index(arr), pd.TimedeltaIndex(arr)) + tm.assert_index_equal(pd.Index(np.array(arr)), + pd.TimedeltaIndex(np.array(arr))) + + arr = [np.nan, pd.NaT, pd.Timedelta('1 days')] + tm.assert_index_equal(pd.Index(arr), pd.TimedeltaIndex(arr)) + tm.assert_index_equal(pd.Index(np.array(arr)), + pd.TimedeltaIndex(np.array(arr))) + def test_shift(self): # test shift for TimedeltaIndex # err8083 @@ -1015,19 +1047,19 @@ def test_get_loc(self): def test_get_indexer(self): idx = pd.to_timedelta(['0 days', '1 days', '2 days']) tm.assert_numpy_array_equal(idx.get_indexer(idx), - np.array([0, 1, 2], dtype=np.int_)) + np.array([0, 1, 2], dtype=np.intp)) target = pd.to_timedelta(['-1 hour', '12 hours', '1 day 1 hour']) tm.assert_numpy_array_equal(idx.get_indexer(target, 'pad'), - np.array([-1, 0, 1], dtype=np.int_)) + np.array([-1, 0, 1], dtype=np.intp)) tm.assert_numpy_array_equal(idx.get_indexer(target, 'backfill'), - np.array([0, 1, 2], dtype=np.int_)) + np.array([0, 1, 2], dtype=np.intp)) tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest'), - np.array([0, 1, 1], dtype=np.int_)) + np.array([0, 1, 1], dtype=np.intp)) res = idx.get_indexer(target, 'nearest', tolerance=pd.Timedelta('1 hour')) - tm.assert_numpy_array_equal(res, np.array([0, -1, 1], dtype=np.int_)) + tm.assert_numpy_array_equal(res, np.array([0, -1, 1], dtype=np.intp)) def test_numeric_compat(self): diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index bec52f5f47b09..5248f0775d22f 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -6,8 +6,8 @@ import re import warnings -from pandas import (date_range, MultiIndex, Index, CategoricalIndex, - compat) +from pandas import (DataFrame, date_range, period_range, MultiIndex, Index, + CategoricalIndex, compat) from pandas.core.common import PerformanceWarning from pandas.indexes.base import InvalidIndexError from pandas.compat import range, lrange, u, PY3, long, lzip @@ -314,6 +314,22 @@ def test_set_levels_labels_names_bad_input(self): with tm.assertRaisesRegexp(TypeError, 'string'): self.index.set_names(names, level=0) + def test_set_levels_categorical(self): + # GH13854 + index = MultiIndex.from_arrays([list("xyzx"), [0, 1, 2, 3]]) + for ordered in [False, True]: + cidx = CategoricalIndex(list("bac"), ordered=ordered) + result = index.set_levels(cidx, 0) + expected = MultiIndex(levels=[cidx, [0, 1, 2, 3]], + labels=index.labels) + tm.assert_index_equal(result, expected) + + result_lvl = result.get_level_values(0) + expected_lvl = CategoricalIndex(list("bacb"), + categories=cidx.categories, + ordered=cidx.ordered) + tm.assert_index_equal(result_lvl, expected_lvl) + def test_metadata_immutable(self): levels, labels = self.index.levels, self.index.labels # shouldn't be able to set at either the top level or base level @@ -632,6 +648,66 @@ def test_from_arrays_index_series_period(self): tm.assert_index_equal(result, result2) + def test_from_arrays_index_datetimelike_mixed(self): + idx1 = pd.date_range('2015-01-01 10:00', freq='D', periods=3, + tz='US/Eastern') + idx2 = pd.date_range('2015-01-01 10:00', freq='H', periods=3) + idx3 = pd.timedelta_range('1 days', freq='D', periods=3) + idx4 = pd.period_range('2011-01-01', freq='D', periods=3) + + result = pd.MultiIndex.from_arrays([idx1, idx2, 
idx3, idx4]) + tm.assert_index_equal(result.get_level_values(0), idx1) + tm.assert_index_equal(result.get_level_values(1), idx2) + tm.assert_index_equal(result.get_level_values(2), idx3) + tm.assert_index_equal(result.get_level_values(3), idx4) + + result2 = pd.MultiIndex.from_arrays([pd.Series(idx1), + pd.Series(idx2), + pd.Series(idx3), + pd.Series(idx4)]) + tm.assert_index_equal(result2.get_level_values(0), idx1) + tm.assert_index_equal(result2.get_level_values(1), idx2) + tm.assert_index_equal(result2.get_level_values(2), idx3) + tm.assert_index_equal(result2.get_level_values(3), idx4) + + tm.assert_index_equal(result, result2) + + def test_from_arrays_index_series_categorical(self): + # GH13743 + idx1 = pd.CategoricalIndex(list("abcaab"), categories=list("bac"), + ordered=False) + idx2 = pd.CategoricalIndex(list("abcaab"), categories=list("bac"), + ordered=True) + + result = pd.MultiIndex.from_arrays([idx1, idx2]) + tm.assert_index_equal(result.get_level_values(0), idx1) + tm.assert_index_equal(result.get_level_values(1), idx2) + + result2 = pd.MultiIndex.from_arrays([pd.Series(idx1), pd.Series(idx2)]) + tm.assert_index_equal(result2.get_level_values(0), idx1) + tm.assert_index_equal(result2.get_level_values(1), idx2) + + result3 = pd.MultiIndex.from_arrays([idx1.values, idx2.values]) + tm.assert_index_equal(result3.get_level_values(0), idx1) + tm.assert_index_equal(result3.get_level_values(1), idx2) + + def test_from_arrays_different_lengths(self): + # GH13599 + idx1 = [1, 2, 3] + idx2 = ['a', 'b'] + assertRaisesRegexp(ValueError, '^all arrays must be same length$', + MultiIndex.from_arrays, [idx1, idx2]) + + idx1 = [] + idx2 = ['a', 'b'] + assertRaisesRegexp(ValueError, '^all arrays must be same length$', + MultiIndex.from_arrays, [idx1, idx2]) + + idx1 = [1, 2, 3] + idx2 = [] + assertRaisesRegexp(ValueError, '^all arrays must be same length$', + MultiIndex.from_arrays, [idx1, idx2]) + def test_from_product(self): first = ['foo', 'bar', 'buz'] @@ -655,6 +731,20 @@ def test_from_product_datetimeindex(self): '2000-01-01')), (2, pd.Timestamp('2000-01-02'))]) tm.assert_numpy_array_equal(mi.values, etalon) + def test_from_product_index_series_categorical(self): + # GH13743 + first = ['foo', 'bar'] + for ordered in [False, True]: + idx = pd.CategoricalIndex(list("abcaab"), categories=list("bac"), + ordered=ordered) + expected = pd.CategoricalIndex(list("abcaab") + list("abcaab"), + categories=list("bac"), + ordered=ordered) + + for arr in [idx, pd.Series(idx), idx.values]: + result = pd.MultiIndex.from_product([first, arr]) + tm.assert_index_equal(result.get_level_values(1), expected) + def test_values_boxed(self): tuples = [(1, pd.Timestamp('2000-01-01')), (2, pd.NaT), (3, pd.Timestamp('2000-01-03')), @@ -679,6 +769,40 @@ def test_append(self): result = self.index.append([]) self.assertTrue(result.equals(self.index)) + def test_append_mixed_dtypes(self): + # GH 13660 + dti = date_range('2011-01-01', freq='M', periods=3,) + dti_tz = date_range('2011-01-01', freq='M', periods=3, tz='US/Eastern') + pi = period_range('2011-01', freq='M', periods=3) + + mi = MultiIndex.from_arrays([[1, 2, 3], + [1.1, np.nan, 3.3], + ['a', 'b', 'c'], + dti, dti_tz, pi]) + self.assertEqual(mi.nlevels, 6) + + res = mi.append(mi) + exp = MultiIndex.from_arrays([[1, 2, 3, 1, 2, 3], + [1.1, np.nan, 3.3, 1.1, np.nan, 3.3], + ['a', 'b', 'c', 'a', 'b', 'c'], + dti.append(dti), + dti_tz.append(dti_tz), + pi.append(pi)]) + tm.assert_index_equal(res, exp) + + other = MultiIndex.from_arrays([['x', 'y', 'z'], ['x', 'y', 
'z'], + ['x', 'y', 'z'], ['x', 'y', 'z'], + ['x', 'y', 'z'], ['x', 'y', 'z']]) + + res = mi.append(other) + exp = MultiIndex.from_arrays([[1, 2, 3, 'x', 'y', 'z'], + [1.1, np.nan, 3.3, 'x', 'y', 'z'], + ['a', 'b', 'c', 'x', 'y', 'z'], + dti.append(pd.Index(['x', 'y', 'z'])), + dti_tz.append(pd.Index(['x', 'y', 'z'])), + pi.append(pd.Index(['x', 'y', 'z']))]) + tm.assert_index_equal(res, exp) + def test_get_level_values(self): result = self.index.get_level_values(0) expected = Index(['foo', 'foo', 'bar', 'baz', 'qux', 'qux'], @@ -758,7 +882,7 @@ def test_legacy_pickle(self): self.assertTrue(obj.equals(obj2)) res = obj.get_indexer(obj) - exp = np.arange(len(obj)) + exp = np.arange(len(obj), dtype=np.intp) assert_almost_equal(res, exp) res = obj.get_indexer(obj2[::-1]) @@ -777,7 +901,7 @@ def test_legacy_v2_unpickle(self): self.assertTrue(obj.equals(obj2)) res = obj.get_indexer(obj) - exp = np.arange(len(obj)) + exp = np.arange(len(obj), dtype=np.intp) assert_almost_equal(res, exp) res = obj.get_indexer(obj2[::-1]) @@ -1022,8 +1146,8 @@ def test_get_indexer(self): major_axis = Index(lrange(4)) minor_axis = Index(lrange(2)) - major_labels = np.array([0, 0, 1, 2, 2, 3, 3]) - minor_labels = np.array([0, 1, 0, 0, 1, 0, 1]) + major_labels = np.array([0, 0, 1, 2, 2, 3, 3], dtype=np.intp) + minor_labels = np.array([0, 1, 0, 0, 1, 0, 1], dtype=np.intp) index = MultiIndex(levels=[major_axis, minor_axis], labels=[major_labels, minor_labels]) @@ -1031,10 +1155,10 @@ def test_get_indexer(self): idx2 = index[[1, 3, 5]] r1 = idx1.get_indexer(idx2) - assert_almost_equal(r1, np.array([1, 3, -1])) + assert_almost_equal(r1, np.array([1, 3, -1], dtype=np.intp)) r1 = idx2.get_indexer(idx1, method='pad') - e1 = np.array([-1, 0, 0, 1, 1]) + e1 = np.array([-1, 0, 0, 1, 1], dtype=np.intp) assert_almost_equal(r1, e1) r2 = idx2.get_indexer(idx1[::-1], method='pad') @@ -1044,7 +1168,7 @@ def test_get_indexer(self): assert_almost_equal(r1, rffill1) r1 = idx2.get_indexer(idx1, method='backfill') - e1 = np.array([0, 0, 1, 1, 2]) + e1 = np.array([0, 0, 1, 1, 2], dtype=np.intp) assert_almost_equal(r1, e1) r2 = idx2.get_indexer(idx1[::-1], method='backfill') @@ -1142,7 +1266,7 @@ def test_to_hierarchical(self): def test_bounds(self): self.index._bounds - def test_equals(self): + def test_equals_multi(self): self.assertTrue(self.index.equals(self.index)) self.assertTrue(self.index.equal_levels(self.index)) @@ -1284,21 +1408,24 @@ def test_intersection(self): # result = self.index & tuples # self.assertTrue(result.equals(tuples)) - def test_difference(self): + def test_sub(self): first = self.index - result = first.difference(self.index[-3:]) - # - API change GH 8226 - with tm.assert_produces_warning(): + # - now raises (previously was set op difference) + with tm.assertRaises(TypeError): first - self.index[-3:] - with tm.assert_produces_warning(): + with tm.assertRaises(TypeError): self.index[-3:] - first - with tm.assert_produces_warning(): + with tm.assertRaises(TypeError): self.index[-3:] - first.tolist() + with tm.assertRaises(TypeError): + first.tolist() - self.index[-3:] - self.assertRaises(TypeError, lambda: first.tolist() - self.index[-3:]) + def test_difference(self): + first = self.index + result = first.difference(self.index[-3:]) expected = MultiIndex.from_tuples(sorted(self.index[:-3].values), sortorder=0, names=self.index.names) @@ -1706,8 +1833,8 @@ def test_join_multi(self): jidx, lidx, ridx = midx.join(idx, how='inner', return_indexers=True) exp_idx = pd.MultiIndex.from_product( [np.arange(4), [1, 2]], 
names=['a', 'b']) - exp_lidx = np.array([1, 2, 5, 6, 9, 10, 13, 14], dtype=np.int_) - exp_ridx = np.array([0, 1, 0, 1, 0, 1, 0, 1], dtype=np.int64) + exp_lidx = np.array([1, 2, 5, 6, 9, 10, 13, 14], dtype=np.intp) + exp_ridx = np.array([0, 1, 0, 1, 0, 1, 0, 1], dtype=np.intp) self.assert_index_equal(jidx, exp_idx) self.assert_numpy_array_equal(lidx, exp_lidx) self.assert_numpy_array_equal(ridx, exp_ridx) @@ -1720,7 +1847,7 @@ def test_join_multi(self): # keep MultiIndex jidx, lidx, ridx = midx.join(idx, how='left', return_indexers=True) exp_ridx = np.array([-1, 0, 1, -1, -1, 0, 1, -1, -1, 0, 1, -1, -1, 0, - 1, -1], dtype=np.int64) + 1, -1], dtype=np.intp) self.assert_index_equal(jidx, midx) self.assertIsNone(lidx) self.assert_numpy_array_equal(ridx, exp_ridx) @@ -1750,12 +1877,12 @@ def test_reindex_level(self): exp_index2 = self.index.join(idx, level='second', how='left') self.assertTrue(target.equals(exp_index)) - exp_indexer = np.array([0, 2, 4], dtype=np.int64) - tm.assert_numpy_array_equal(indexer, exp_indexer) + exp_indexer = np.array([0, 2, 4]) + tm.assert_numpy_array_equal(indexer, exp_indexer, check_dtype=False) self.assertTrue(target2.equals(exp_index2)) - exp_indexer2 = np.array([0, -1, 0, -1, 0, -1], dtype=np.int64) - tm.assert_numpy_array_equal(indexer2, exp_indexer2) + exp_indexer2 = np.array([0, -1, 0, -1, 0, -1]) + tm.assert_numpy_array_equal(indexer2, exp_indexer2, check_dtype=False) assertRaisesRegexp(TypeError, "Fill method not supported", self.index.reindex, self.index, method='pad', @@ -1843,7 +1970,7 @@ def check(nlevels, with_nulls): for keep in ['first', 'last', False]: left = mi.duplicated(keep=keep) - right = pd.lib.duplicated(mi.values, keep=keep) + right = pd.hashtable.duplicated_object(mi.values, keep=keep) tm.assert_numpy_array_equal(left, right) # GH5873 @@ -1877,6 +2004,47 @@ def test_duplicate_meta_data(self): self.assertTrue(idx.has_duplicates) self.assertEqual(idx.drop_duplicates().names, idx.names) + def test_get_unique_index(self): + idx = self.index[[0, 1, 0, 1, 1, 0, 0]] + expected = self.index._shallow_copy(idx[[0, 1]]) + + for dropna in [False, True]: + result = idx._get_unique_index(dropna=dropna) + self.assertTrue(result.unique) + self.assert_index_equal(result, expected) + + def test_unique(self): + mi = pd.MultiIndex.from_arrays([[1, 2, 1, 2], [1, 1, 1, 2]]) + + res = mi.unique() + exp = pd.MultiIndex.from_arrays([[1, 2, 2], [1, 1, 2]]) + tm.assert_index_equal(res, exp) + + mi = pd.MultiIndex.from_arrays([list('aaaa'), list('abab')]) + res = mi.unique() + exp = pd.MultiIndex.from_arrays([list('aa'), list('ab')]) + tm.assert_index_equal(res, exp) + + mi = pd.MultiIndex.from_arrays([list('aaaa'), list('aaaa')]) + res = mi.unique() + exp = pd.MultiIndex.from_arrays([['a'], ['a']]) + tm.assert_index_equal(res, exp) + + def test_unique_datetimelike(self): + idx1 = pd.DatetimeIndex(['2015-01-01', '2015-01-01', '2015-01-01', + '2015-01-01', 'NaT', 'NaT']) + idx2 = pd.DatetimeIndex(['2015-01-01', '2015-01-01', '2015-01-02', + '2015-01-02', 'NaT', '2015-01-01'], + tz='Asia/Tokyo') + result = pd.MultiIndex.from_arrays([idx1, idx2]).unique() + + eidx1 = pd.DatetimeIndex(['2015-01-01', '2015-01-01', 'NaT', 'NaT']) + eidx2 = pd.DatetimeIndex(['2015-01-01', '2015-01-02', + 'NaT', '2015-01-01'], + tz='Asia/Tokyo') + exp = pd.MultiIndex.from_arrays([eidx1, eidx2]) + tm.assert_index_equal(result, exp) + def test_tolist(self): result = self.index.tolist() exp = list(self.index.values) @@ -2201,6 +2369,15 @@ def test_partial_string_timestamp_multiindex(self): with 
assertRaises(KeyError): df_swap.loc['2016-01-01'] + # GH12685 (partial string with daily resolution or below) + dr = date_range('2013-01-01', periods=100, freq='D') + ix = MultiIndex.from_product([dr, ['a', 'b']]) + df = DataFrame(np.random.randn(200, 1), columns=['A'], index=ix) + + result = df.loc[idx['2013-03':'2013-03', :], :] + expected = df.iloc[118:180] + tm.assert_frame_equal(result, expected) + def test_rangeindex_fallback_coercion_bug(self): # GH 12893 foo = pd.DataFrame(np.arange(100).reshape((10, 10))) @@ -2223,3 +2400,24 @@ def test_rangeindex_fallback_coercion_bug(self): result = df.index.get_level_values('buzz') expected = pd.Int64Index(np.tile(np.arange(10), 10), name='buzz') tm.assert_index_equal(result, expected) + + def test_dropna(self): + # GH 6194 + idx = pd.MultiIndex.from_arrays([[1, np.nan, 3, np.nan, 5], + [1, 2, np.nan, np.nan, 5], + ['a', 'b', 'c', np.nan, 'e']]) + + exp = pd.MultiIndex.from_arrays([[1, 5], + [1, 5], + ['a', 'e']]) + tm.assert_index_equal(idx.dropna(), exp) + tm.assert_index_equal(idx.dropna(how='any'), exp) + + exp = pd.MultiIndex.from_arrays([[1, np.nan, 3, 5], + [1, 2, np.nan, 5], + ['a', 'b', 'c', 'e']]) + tm.assert_index_equal(idx.dropna(how='all'), exp) + + msg = "invalid how option: xxx" + with tm.assertRaisesRegexp(ValueError, msg): + idx.dropna(how='xxx') diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index 5eac0bc870756..d3a89b301ae46 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -169,8 +169,8 @@ def test_constructor(self): # explicit construction index = Float64Index([1, 2, 3, 4, 5]) self.assertIsInstance(index, Float64Index) - self.assertTrue((index.values == np.array( - [1, 2, 3, 4, 5], dtype='float64')).all()) + expected = np.array([1, 2, 3, 4, 5], dtype='float64') + self.assert_numpy_array_equal(index.values, expected) index = Float64Index(np.array([1, 2, 3, 4, 5])) self.assertIsInstance(index, Float64Index) index = Float64Index([1., 2, 3, 4, 5]) @@ -265,7 +265,7 @@ def test_astype(self): i = Float64Index([0, 1.1, np.NAN]) self.assertRaises(ValueError, lambda: i.astype(dtype)) - def test_equals(self): + def test_equals_numeric(self): i = Float64Index([1.0, 2.0]) self.assertTrue(i.equals(i)) @@ -284,15 +284,15 @@ def test_equals(self): def test_get_indexer(self): idx = Float64Index([0.0, 1.0, 2.0]) tm.assert_numpy_array_equal(idx.get_indexer(idx), - np.array([0, 1, 2], dtype=np.int_)) + np.array([0, 1, 2], dtype=np.intp)) target = [-0.1, 0.5, 1.1] tm.assert_numpy_array_equal(idx.get_indexer(target, 'pad'), - np.array([-1, 0, 1], dtype=np.int_)) + np.array([-1, 0, 1], dtype=np.intp)) tm.assert_numpy_array_equal(idx.get_indexer(target, 'backfill'), - np.array([0, 1, 2], dtype=np.int_)) + np.array([0, 1, 2], dtype=np.intp)) tm.assert_numpy_array_equal(idx.get_indexer(target, 'nearest'), - np.array([0, 1, 1], dtype=np.int_)) + np.array([0, 1, 1], dtype=np.intp)) def test_get_loc(self): idx = Float64Index([0.0, 1.0, 2.0]) @@ -560,19 +560,19 @@ def test_identical(self): def test_get_indexer(self): target = Int64Index(np.arange(10)) indexer = self.index.get_indexer(target) - expected = np.array([0, -1, 1, -1, 2, -1, 3, -1, 4, -1]) + expected = np.array([0, -1, 1, -1, 2, -1, 3, -1, 4, -1], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected) def test_get_indexer_pad(self): target = Int64Index(np.arange(10)) indexer = self.index.get_indexer(target, method='pad') - expected = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4]) + expected = np.array([0, 
0, 1, 1, 2, 2, 3, 3, 4, 4], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected) def test_get_indexer_backfill(self): target = Int64Index(np.arange(10)) indexer = self.index.get_indexer(target, method='backfill') - expected = np.array([0, 1, 1, 2, 2, 3, 3, 4, 4, 5]) + expected = np.array([0, 1, 1, 2, 2, 3, 3, 4, 4, 5], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected) def test_join_outer(self): @@ -588,9 +588,9 @@ def test_join_outer(self): eres = Int64Index([0, 1, 2, 4, 5, 6, 7, 8, 10, 12, 14, 16, 18, 25]) elidx = np.array([0, -1, 1, 2, -1, 3, -1, 4, 5, 6, 7, 8, 9, -1], - dtype=np.int_) + dtype=np.intp) eridx = np.array([-1, 3, 4, -1, 5, -1, 0, -1, -1, 1, -1, -1, -1, 2], - dtype=np.int_) + dtype=np.intp) tm.assertIsInstance(res, Int64Index) self.assert_index_equal(res, eres) @@ -604,9 +604,9 @@ def test_join_outer(self): self.assert_index_equal(res, noidx_res) elidx = np.array([0, -1, 1, 2, -1, 3, -1, 4, 5, 6, 7, 8, 9, -1], - dtype=np.int64) + dtype=np.intp) eridx = np.array([-1, 0, 1, -1, 2, -1, 3, -1, -1, 4, -1, -1, -1, 5], - dtype=np.int64) + dtype=np.intp) tm.assertIsInstance(res, Int64Index) self.assert_index_equal(res, eres) tm.assert_numpy_array_equal(lidx, elidx) @@ -627,8 +627,8 @@ def test_join_inner(self): ridx = ridx.take(ind) eres = Int64Index([2, 12]) - elidx = np.array([1, 6], dtype=np.int_) - eridx = np.array([4, 1], dtype=np.int_) + elidx = np.array([1, 6], dtype=np.intp) + eridx = np.array([4, 1], dtype=np.intp) tm.assertIsInstance(res, Int64Index) self.assert_index_equal(res, eres) @@ -642,8 +642,8 @@ def test_join_inner(self): res2 = self.index.intersection(other_mono) self.assert_index_equal(res, res2) - elidx = np.array([1, 6], dtype=np.int64) - eridx = np.array([1, 4], dtype=np.int64) + elidx = np.array([1, 6], dtype=np.intp) + eridx = np.array([1, 4], dtype=np.intp) tm.assertIsInstance(res, Int64Index) self.assert_index_equal(res, eres) tm.assert_numpy_array_equal(lidx, elidx) @@ -658,7 +658,7 @@ def test_join_left(self): return_indexers=True) eres = self.index eridx = np.array([-1, 4, -1, -1, -1, -1, 1, -1, -1, -1], - dtype=np.int_) + dtype=np.intp) tm.assertIsInstance(res, Int64Index) self.assert_index_equal(res, eres) @@ -669,7 +669,7 @@ def test_join_left(self): res, lidx, ridx = self.index.join(other_mono, how='left', return_indexers=True) eridx = np.array([-1, 1, -1, -1, -1, -1, 4, -1, -1, -1], - dtype=np.int64) + dtype=np.intp) tm.assertIsInstance(res, Int64Index) self.assert_index_equal(res, eres) self.assertIsNone(lidx) @@ -680,8 +680,8 @@ def test_join_left(self): idx2 = Index([1, 2, 5, 7, 9]) res, lidx, ridx = idx2.join(idx, how='left', return_indexers=True) eres = Index([1, 1, 2, 5, 7, 9]) # 1 is in idx2, so it should be x2 - eridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.int64) - elidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.int64) + eridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.intp) + elidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.intp) self.assert_index_equal(res, eres) tm.assert_numpy_array_equal(lidx, elidx) tm.assert_numpy_array_equal(ridx, eridx) @@ -694,7 +694,7 @@ def test_join_right(self): res, lidx, ridx = self.index.join(other, how='right', return_indexers=True) eres = other - elidx = np.array([-1, 6, -1, -1, 1, -1], dtype=np.int_) + elidx = np.array([-1, 6, -1, -1, 1, -1], dtype=np.intp) tm.assertIsInstance(other, Int64Index) self.assert_index_equal(res, eres) @@ -705,7 +705,7 @@ def test_join_right(self): res, lidx, ridx = self.index.join(other_mono, how='right', return_indexers=True) eres = other_mono - elidx = 
np.array([-1, 1, -1, -1, 6, -1], dtype=np.int64) + elidx = np.array([-1, 1, -1, -1, 6, -1], dtype=np.intp) tm.assertIsInstance(other, Int64Index) self.assert_index_equal(res, eres) tm.assert_numpy_array_equal(lidx, elidx) @@ -716,8 +716,8 @@ def test_join_right(self): idx2 = Index([1, 2, 5, 7, 9]) res, lidx, ridx = idx.join(idx2, how='right', return_indexers=True) eres = Index([1, 1, 2, 5, 7, 9]) # 1 is in idx2, so it should be x2 - elidx = np.array([0, 1, 2, 3, -1, -1], dtype=np.int64) - eridx = np.array([0, 0, 1, 2, 3, 4], dtype=np.int64) + elidx = np.array([0, 1, 2, 3, -1, -1], dtype=np.intp) + eridx = np.array([0, 0, 1, 2, 3, 4], dtype=np.intp) self.assert_index_equal(res, eres) tm.assert_numpy_array_equal(lidx, elidx) tm.assert_numpy_array_equal(ridx, eridx) @@ -757,10 +757,10 @@ def test_join_non_unique(self): exp_joined = Index([3, 3, 3, 3, 4, 4, 4, 4]) self.assert_index_equal(joined, exp_joined) - exp_lidx = np.array([2, 2, 3, 3, 0, 0, 1, 1], dtype=np.int_) + exp_lidx = np.array([2, 2, 3, 3, 0, 0, 1, 1], dtype=np.intp) tm.assert_numpy_array_equal(lidx, exp_lidx) - exp_ridx = np.array([2, 3, 2, 3, 0, 1, 0, 1], dtype=np.int_) + exp_ridx = np.array([2, 3, 2, 3, 0, 1, 0, 1], dtype=np.intp) tm.assert_numpy_array_equal(ridx, exp_ridx) def test_join_self(self): diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py index 99e4b72bcee37..b0b8864521666 100644 --- a/pandas/tests/indexes/test_range.py +++ b/pandas/tests/indexes/test_range.py @@ -74,24 +74,36 @@ def test_constructor(self): self.assertEqual(index._step, 2) tm.assert_index_equal(Index(expected), index) - index = RangeIndex() - expected = np.empty(0, dtype=np.int64) - self.assertIsInstance(index, RangeIndex) - self.assertEqual(index._start, 0) - self.assertEqual(index._stop, 0) - self.assertEqual(index._step, 1) - tm.assert_index_equal(Index(expected), index) - - index = RangeIndex(name='Foo') - self.assertIsInstance(index, RangeIndex) - self.assertEqual(index.name, 'Foo') + msg = "RangeIndex\\(\\.\\.\\.\\) must be called with integers" + with tm.assertRaisesRegexp(TypeError, msg): + RangeIndex() + + for index in [RangeIndex(0), RangeIndex(start=0), RangeIndex(stop=0), + RangeIndex(0, 0)]: + expected = np.empty(0, dtype=np.int64) + self.assertIsInstance(index, RangeIndex) + self.assertEqual(index._start, 0) + self.assertEqual(index._stop, 0) + self.assertEqual(index._step, 1) + tm.assert_index_equal(Index(expected), index) + + with tm.assertRaisesRegexp(TypeError, msg): + RangeIndex(name='Foo') + + for index in [RangeIndex(0, name='Foo'), + RangeIndex(start=0, name='Foo'), + RangeIndex(stop=0, name='Foo'), + RangeIndex(0, 0, name='Foo')]: + self.assertIsInstance(index, RangeIndex) + self.assertEqual(index.name, 'Foo') # we don't allow on a bare Index self.assertRaises(TypeError, lambda: Index(0, 1000)) # invalid args for i in [Index(['a', 'b']), Series(['a', 'b']), np.array(['a', 'b']), - [], 'foo', datetime(2000, 1, 1, 0, 0), np.arange(0, 10)]: + [], 'foo', datetime(2000, 1, 1, 0, 0), np.arange(0, 10), + np.array([1]), [1]]: self.assertRaises(TypeError, lambda: RangeIndex(i)) def test_constructor_same(self): @@ -210,10 +222,10 @@ def test_numeric_compat2(self): RangeIndex(0, 1000, 1)._int64index // 2), (RangeIndex(0, 100, 1), 2.0, RangeIndex(0, 100, 1)._int64index // 2.0), - (RangeIndex(), 50, RangeIndex()), + (RangeIndex(0), 50, RangeIndex(0)), (RangeIndex(2, 4, 2), 3, RangeIndex(0, 1, 1)), (RangeIndex(-5, -10, -6), 4, RangeIndex(-2, -1, 1)), - (RangeIndex(-100, -200, 3), 2, RangeIndex())] + 
(RangeIndex(-100, -200, 3), 2, RangeIndex(0))] for idx, div, expected in cases_exact: tm.assert_index_equal(idx // div, expected, exact=True) @@ -288,7 +300,7 @@ def test_delete(self): def test_view(self): super(TestRangeIndex, self).test_view() - i = RangeIndex(name='Foo') + i = RangeIndex(0, name='Foo') i_view = i.view() self.assertEqual(i_view.name, 'Foo') @@ -315,7 +327,17 @@ def test_is_monotonic(self): self.assertTrue(index.is_monotonic_increasing) self.assertTrue(index.is_monotonic_decreasing) - def test_equals(self): + index = RangeIndex(2, 1) + self.assertTrue(index.is_monotonic) + self.assertTrue(index.is_monotonic_increasing) + self.assertTrue(index.is_monotonic_decreasing) + + index = RangeIndex(1, 1) + self.assertTrue(index.is_monotonic) + self.assertTrue(index.is_monotonic_increasing) + self.assertTrue(index.is_monotonic_decreasing) + + def test_equals_range(self): equiv_pairs = [(RangeIndex(0, 9, 2), RangeIndex(0, 10, 2)), (RangeIndex(0), RangeIndex(1, -1, 3)), (RangeIndex(1, 2, 3), RangeIndex(1, 3, 4)), @@ -355,19 +377,19 @@ def test_identical(self): def test_get_indexer(self): target = RangeIndex(10) indexer = self.index.get_indexer(target) - expected = np.array([0, -1, 1, -1, 2, -1, 3, -1, 4, -1]) + expected = np.array([0, -1, 1, -1, 2, -1, 3, -1, 4, -1], dtype=np.intp) self.assert_numpy_array_equal(indexer, expected) def test_get_indexer_pad(self): target = RangeIndex(10) indexer = self.index.get_indexer(target, method='pad') - expected = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4]) + expected = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4], dtype=np.intp) self.assert_numpy_array_equal(indexer, expected) def test_get_indexer_backfill(self): target = RangeIndex(10) indexer = self.index.get_indexer(target, method='backfill') - expected = np.array([0, 1, 1, 2, 2, 3, 3, 4, 4, 5]) + expected = np.array([0, 1, 1, 2, 2, 3, 3, 4, 4, 5], dtype=np.intp) self.assert_numpy_array_equal(indexer, expected) def test_join_outer(self): @@ -382,9 +404,9 @@ def test_join_outer(self): eres = Int64Index([0, 2, 4, 6, 8, 10, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]) elidx = np.array([0, 1, 2, 3, 4, 5, 6, 7, -1, 8, -1, 9, - -1, -1, -1, -1, -1, -1, -1], dtype=np.int_) + -1, -1, -1, -1, -1, -1, -1], dtype=np.intp) eridx = np.array([-1, -1, -1, -1, -1, -1, -1, -1, 10, 9, 8, 7, 6, - 5, 4, 3, 2, 1, 0], dtype=np.int_) + 5, 4, 3, 2, 1, 0], dtype=np.intp) self.assertIsInstance(res, Int64Index) self.assertFalse(isinstance(res, RangeIndex)) @@ -420,8 +442,8 @@ def test_join_inner(self): ridx = ridx.take(ind) eres = Int64Index([16, 18]) - elidx = np.array([8, 9]) - eridx = np.array([9, 7]) + elidx = np.array([8, 9], dtype=np.intp) + eridx = np.array([9, 7], dtype=np.intp) self.assertIsInstance(res, Int64Index) self.assert_index_equal(res, eres) @@ -446,7 +468,7 @@ def test_join_left(self): res, lidx, ridx = self.index.join(other, how='left', return_indexers=True) eres = self.index - eridx = np.array([-1, -1, -1, -1, -1, -1, -1, -1, 9, 7], dtype=np.int_) + eridx = np.array([-1, -1, -1, -1, -1, -1, -1, -1, 9, 7], dtype=np.intp) self.assertIsInstance(res, RangeIndex) self.assert_index_equal(res, eres) @@ -472,7 +494,7 @@ def test_join_right(self): return_indexers=True) eres = other elidx = np.array([-1, -1, -1, -1, -1, -1, -1, 9, -1, 8, -1], - dtype=np.int_) + dtype=np.intp) self.assertIsInstance(other, Int64Index) self.assert_index_equal(res, eres) @@ -524,9 +546,9 @@ def test_join_non_unique(self): res, lidx, ridx = self.index.join(other, return_indexers=True) eres = Int64Index([0, 2, 4, 4, 6, 8, 10, 12, 
14, 16, 18]) - elidx = np.array([0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 9], dtype=np.int_) + elidx = np.array([0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 9], dtype=np.intp) eridx = np.array([-1, -1, 0, 1, -1, -1, -1, -1, -1, -1, -1], - dtype=np.int_) + dtype=np.intp) self.assert_index_equal(res, eres) self.assert_numpy_array_equal(lidx, elidx) @@ -602,8 +624,8 @@ def test_union(self): (RI(0, 100, 5), RI(0, 100, 20), RI(0, 100, 5)), (RI(0, -100, -5), RI(5, -100, -20), RI(-95, 10, 5)), (RI(0, -11, -1), RI(1, -12, -4), RI(-11, 2, 1)), - (RI(), RI(), RI()), - (RI(0, -10, -2), RI(), RI(0, -10, -2)), + (RI(0), RI(0), RI(0)), + (RI(0, -10, -2), RI(0), RI(0, -10, -2)), (RI(0, 100, 2), RI(100, 150, 200), RI(0, 102, 2)), (RI(0, -100, -2), RI(-100, 50, 102), RI(-100, 4, 2)), (RI(0, -100, -1), RI(0, -50, -3), RI(-99, 1, 1)), @@ -611,7 +633,7 @@ def test_union(self): (RI(0, 10, 5), RI(-5, -6, -20), RI(-5, 10, 5)), (RI(0, 3, 1), RI(4, 5, 1), I64([0, 1, 2, 4])), (RI(0, 10, 1), I64([]), RI(0, 10, 1)), - (RI(), I64([1, 5, 6]), I64([1, 5, 6]))] + (RI(0), I64([1, 5, 6]), I64([1, 5, 6]))] for idx1, idx2, expected in cases: res1 = idx1.union(idx2) res2 = idx2.union(idx1) diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 23600e1f4241c..5fbaea6c5efcb 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -13,444 +13,1153 @@ ############################################################### -class TestIndexCoercion(tm.TestCase): +class CoercionBase(object): _multiprocess_can_split_ = True - def test_setitem_index_numeric_coercion_int(self): - # tests setitem with non-existing numeric key - s = pd.Series([1, 2, 3, 4]) - self.assertEqual(s.index.dtype, np.int64) - - # int + int -> int - temp = s.copy() - temp[5] = 5 - tm.assert_series_equal(temp, pd.Series([1, 2, 3, 4, 5], - index=[0, 1, 2, 3, 5])) - self.assertEqual(temp.index.dtype, np.int64) - - # int + float -> float - temp = s.copy() - temp[1.1] = 5 - tm.assert_series_equal(temp, pd.Series([1, 2, 3, 4, 5], - index=[0, 1, 2, 3, 1.1])) - self.assertEqual(temp.index.dtype, np.float64) - - def test_setitem_index_numeric_coercion_float(self): - # tests setitem with non-existing numeric key - s = pd.Series([1, 2, 3, 4], index=[1.1, 2.1, 3.1, 4.1]) - self.assertEqual(s.index.dtype, np.float64) - - # float + int -> int - temp = s.copy() - # TODO_GH12747 The result must be float - with tm.assertRaises(IndexError): - temp[5] = 5 - - # float + float -> float - temp = s.copy() - temp[5.1] = 5 - exp = pd.Series([1, 2, 3, 4, 5], index=[1.1, 2.1, 3.1, 4.1, 5.1]) - tm.assert_series_equal(temp, exp) - self.assertEqual(temp.index.dtype, np.float64) - - def test_insert_numeric_coercion_int(self): - idx = pd.Int64Index([1, 2, 3, 4]) - self.assertEqual(idx.dtype, np.int64) - - # int + int -> int - res = idx.insert(1, 1) - tm.assert_index_equal(res, pd.Index([1, 1, 2, 3, 4])) - self.assertEqual(res.dtype, np.int64) - - # int + float -> float - res = idx.insert(1, 1.1) - tm.assert_index_equal(res, pd.Index([1, 1.1, 2, 3, 4])) - self.assertEqual(res.dtype, np.float64) - - # int + bool -> int - res = idx.insert(1, False) - tm.assert_index_equal(res, pd.Index([1, 0, 2, 3, 4])) - self.assertEqual(res.dtype, np.int64) - - def test_insert_numeric_coercion_float(self): - idx = pd.Float64Index([1, 2, 3, 4]) - self.assertEqual(idx.dtype, np.float64) - - # float + int -> int - res = idx.insert(1, 1) - tm.assert_index_equal(res, pd.Index([1., 1., 2., 3., 4.])) - self.assertEqual(res.dtype, np.float64) - - # float + float -> 
float - res = idx.insert(1, 1.1) - tm.assert_index_equal(res, pd.Index([1., 1.1, 2., 3., 4.])) - self.assertEqual(res.dtype, np.float64) - - # float + bool -> float - res = idx.insert(1, False) - tm.assert_index_equal(res, pd.Index([1., 0., 2., 3., 4.])) - self.assertEqual(res.dtype, np.float64) - + klasses = ['index', 'series'] + dtypes = ['object', 'int64', 'float64', 'complex128', 'bool', + 'datetime64', 'datetime64tz', 'timedelta64', 'period'] -class TestSeriesCoercion(tm.TestCase): + @property + def method(self): + raise NotImplementedError(self) - _multiprocess_can_split_ = True - - def setUp(self): - self.rep = {} - self.rep['object'] = ['a', 'b'] - self.rep['int64'] = [4, 5] - self.rep['float64'] = [1.1, 2.2] - self.rep['complex128'] = [1 + 1j, 2 + 2j] - self.rep['bool'] = [True, False] - - def test_setitem_numeric_coercion_int(self): - s = pd.Series([1, 2, 3, 4]) - self.assertEqual(s.dtype, np.int64) + def _assert(self, left, right, dtype): + # explicitly check dtype to avoid any unexpected result + if isinstance(left, pd.Series): + tm.assert_series_equal(left, right) + elif isinstance(left, pd.Index): + tm.assert_index_equal(left, right) + else: + raise NotImplementedError + self.assertEqual(left.dtype, dtype) + self.assertEqual(right.dtype, dtype) + + def test_has_comprehensive_tests(self): + for klass in self.klasses: + for dtype in self.dtypes: + method_name = 'test_{0}_{1}_{2}'.format(self.method, + klass, dtype) + if not hasattr(self, method_name): + msg = 'test method is not defined: {0}, {1}' + raise AssertionError(msg.format(type(self), method_name)) + + +class TestSetitemCoercion(CoercionBase, tm.TestCase): + + method = 'setitem' + + def _assert_setitem_series_conversion(self, original_series, loc_value, + expected_series, expected_dtype): + """ test series value's coercion triggered by assignment """ + temp = original_series.copy() + temp[1] = loc_value + tm.assert_series_equal(temp, expected_series) + # check dtype explicitly for sure + self.assertEqual(temp.dtype, expected_dtype) + + # .loc works different rule, temporary disable + # temp = original_series.copy() + # temp.loc[1] = loc_value + # tm.assert_series_equal(temp, expected_series) + + def test_setitem_series_object(self): + obj = pd.Series(list('abcd')) + self.assertEqual(obj.dtype, np.object) + + # object + int -> object + exp = pd.Series(['a', 1, 'c', 'd']) + self._assert_setitem_series_conversion(obj, 1, exp, np.object) + + # object + float -> object + exp = pd.Series(['a', 1.1, 'c', 'd']) + self._assert_setitem_series_conversion(obj, 1.1, exp, np.object) + + # object + complex -> object + exp = pd.Series(['a', 1 + 1j, 'c', 'd']) + self._assert_setitem_series_conversion(obj, 1 + 1j, exp, np.object) + + # object + bool -> object + exp = pd.Series(['a', True, 'c', 'd']) + self._assert_setitem_series_conversion(obj, True, exp, np.object) + + def test_setitem_series_int64(self): + obj = pd.Series([1, 2, 3, 4]) + self.assertEqual(obj.dtype, np.int64) # int + int -> int - temp = s.copy() - temp[1] = 1 - tm.assert_series_equal(temp, pd.Series([1, 1, 3, 4])) - self.assertEqual(temp.dtype, np.int64) + exp = pd.Series([1, 1, 3, 4]) + self._assert_setitem_series_conversion(obj, 1, exp, np.int64) # int + float -> float # TODO_GH12747 The result must be float - temp = s.copy() - temp[1] = 1.1 # tm.assert_series_equal(temp, pd.Series([1, 1.1, 3, 4])) # self.assertEqual(temp.dtype, np.float64) - tm.assert_series_equal(temp, pd.Series([1, 1, 3, 4])) - self.assertEqual(temp.dtype, np.int64) + exp = pd.Series([1, 1, 3, 4]) 
+ self._assert_setitem_series_conversion(obj, 1.1, exp, np.int64) # int + complex -> complex - temp = s.copy() - temp[1] = 1 + 1j - tm.assert_series_equal(temp, pd.Series([1, 1 + 1j, 3, 4])) - self.assertEqual(temp.dtype, np.complex128) + exp = pd.Series([1, 1 + 1j, 3, 4]) + self._assert_setitem_series_conversion(obj, 1 + 1j, exp, np.complex128) # int + bool -> int - temp = s.copy() - temp[1] = True - tm.assert_series_equal(temp, pd.Series([1, 1, 3, 4])) - self.assertEqual(temp.dtype, np.int64) + exp = pd.Series([1, 1, 3, 4]) + self._assert_setitem_series_conversion(obj, True, exp, np.int64) - def test_setitem_numeric_coercion_float(self): - s = pd.Series([1.1, 2.2, 3.3, 4.4]) - self.assertEqual(s.dtype, np.float64) + def test_setitem_series_float64(self): + obj = pd.Series([1.1, 2.2, 3.3, 4.4]) + self.assertEqual(obj.dtype, np.float64) # float + int -> float - temp = s.copy() - temp[1] = 1 - tm.assert_series_equal(temp, pd.Series([1.1, 1.0, 3.3, 4.4])) - self.assertEqual(temp.dtype, np.float64) + exp = pd.Series([1.1, 1.0, 3.3, 4.4]) + self._assert_setitem_series_conversion(obj, 1, exp, np.float64) # float + float -> float - temp = s.copy() - temp[1] = 1.1 - tm.assert_series_equal(temp, pd.Series([1.1, 1.1, 3.3, 4.4])) - self.assertEqual(temp.dtype, np.float64) + exp = pd.Series([1.1, 1.1, 3.3, 4.4]) + self._assert_setitem_series_conversion(obj, 1.1, exp, np.float64) # float + complex -> complex - temp = s.copy() - temp[1] = 1 + 1j - tm.assert_series_equal(temp, pd.Series([1.1, 1 + 1j, 3.3, 4.4])) - self.assertEqual(temp.dtype, np.complex128) + exp = pd.Series([1.1, 1 + 1j, 3.3, 4.4]) + self._assert_setitem_series_conversion(obj, 1 + 1j, exp, + np.complex128) # float + bool -> float - temp = s.copy() - temp[1] = True - tm.assert_series_equal(temp, pd.Series([1.1, 1.0, 3.3, 4.4])) - self.assertEqual(temp.dtype, np.float64) + exp = pd.Series([1.1, 1.0, 3.3, 4.4]) + self._assert_setitem_series_conversion(obj, True, exp, np.float64) - def test_setitem_numeric_coercion_complex(self): - s = pd.Series([1 + 1j, 2 + 2j, 3 + 3j, 4 + 4j]) - self.assertEqual(s.dtype, np.complex128) + def test_setitem_series_complex128(self): + obj = pd.Series([1 + 1j, 2 + 2j, 3 + 3j, 4 + 4j]) + self.assertEqual(obj.dtype, np.complex128) # complex + int -> complex - temp = s.copy() - temp[1] = 1 - tm.assert_series_equal(temp, pd.Series([1 + 1j, 1, 3 + 3j, 4 + 4j])) - self.assertEqual(temp.dtype, np.complex128) + exp = pd.Series([1 + 1j, 1, 3 + 3j, 4 + 4j]) + self._assert_setitem_series_conversion(obj, 1, exp, np.complex128) # complex + float -> complex - temp = s.copy() - temp[1] = 1.1 - tm.assert_series_equal(temp, pd.Series([1 + 1j, 1.1, 3 + 3j, 4 + 4j])) - self.assertEqual(temp.dtype, np.complex128) + exp = pd.Series([1 + 1j, 1.1, 3 + 3j, 4 + 4j]) + self._assert_setitem_series_conversion(obj, 1.1, exp, np.complex128) # complex + complex -> complex - temp = s.copy() - temp[1] = 1 + 1j - tm.assert_series_equal(temp, - pd.Series([1 + 1j, 1 + 1j, 3 + 3j, 4 + 4j])) - self.assertEqual(temp.dtype, np.complex128) + exp = pd.Series([1 + 1j, 1 + 1j, 3 + 3j, 4 + 4j]) + self._assert_setitem_series_conversion(obj, 1 + 1j, exp, np.complex128) # complex + bool -> complex - temp = s.copy() - temp[1] = True - tm.assert_series_equal(temp, pd.Series([1 + 1j, 1, 3 + 3j, 4 + 4j])) - self.assertEqual(temp.dtype, np.complex128) + exp = pd.Series([1 + 1j, 1, 3 + 3j, 4 + 4j]) + self._assert_setitem_series_conversion(obj, True, exp, np.complex128) - def test_setitem_numeric_coercion_bool(self): - s = pd.Series([True, False, True, False])
- self.assertEqual(s.dtype, np.bool) + def test_setitem_series_bool(self): + obj = pd.Series([True, False, True, False]) + self.assertEqual(obj.dtype, np.bool) # bool + int -> int # TODO_GH12747 The result must be int - temp = s.copy() - temp[1] = 1 # tm.assert_series_equal(temp, pd.Series([1, 1, 1, 0])) # self.assertEqual(temp.dtype, np.int64) - tm.assert_series_equal(temp, pd.Series([True, True, True, False])) - self.assertEqual(temp.dtype, np.bool) + exp = pd.Series([True, True, True, False]) + self._assert_setitem_series_conversion(obj, 1, exp, np.bool) # TODO_GH12747 The result must be int - temp = s.copy() - temp[1] = 3 # greater than bool + # assigning int greater than bool # tm.assert_series_equal(temp, pd.Series([1, 3, 1, 0])) # self.assertEqual(temp.dtype, np.int64) - tm.assert_series_equal(temp, pd.Series([True, True, True, False])) - self.assertEqual(temp.dtype, np.bool) + exp = pd.Series([True, True, True, False]) + self._assert_setitem_series_conversion(obj, 3, exp, np.bool) # bool + float -> float # TODO_GH12747 The result must be float - temp = s.copy() - temp[1] = 1.1 # tm.assert_series_equal(temp, pd.Series([1., 1.1, 1., 0.])) # self.assertEqual(temp.dtype, np.float64) - tm.assert_series_equal(temp, pd.Series([True, True, True, False])) - self.assertEqual(temp.dtype, np.bool) + exp = pd.Series([True, True, True, False]) + self._assert_setitem_series_conversion(obj, 1.1, exp, np.bool) # bool + complex -> complex (buggy, results in bool) # TODO_GH12747 The result must be complex - temp = s.copy() - temp[1] = 1 + 1j # tm.assert_series_equal(temp, pd.Series([1, 1 + 1j, 1, 0])) # self.assertEqual(temp.dtype, np.complex128) - tm.assert_series_equal(temp, pd.Series([True, True, True, False])) - self.assertEqual(temp.dtype, np.bool) - - # bool + bool -> int - temp = s.copy() - temp[1] = True - tm.assert_series_equal(temp, pd.Series([True, True, True, False])) - self.assertEqual(temp.dtype, np.bool) - - def test_where_numeric_coercion_int(self): - s = pd.Series([1, 2, 3, 4]) - self.assertEqual(s.dtype, np.int64) - cond = pd.Series([True, False, True, False]) + exp = pd.Series([True, True, True, False]) + self._assert_setitem_series_conversion(obj, 1 + 1j, exp, np.bool) + + # bool + bool -> bool + exp = pd.Series([True, True, True, False]) + self._assert_setitem_series_conversion(obj, True, exp, np.bool) + + def test_setitem_series_datetime64(self): + obj = pd.Series([pd.Timestamp('2011-01-01'), + pd.Timestamp('2011-01-02'), + pd.Timestamp('2011-01-03'), + pd.Timestamp('2011-01-04')]) + self.assertEqual(obj.dtype, 'datetime64[ns]') + + # datetime64 + datetime64 -> datetime64 + exp = pd.Series([pd.Timestamp('2011-01-01'), + pd.Timestamp('2012-01-01'), + pd.Timestamp('2011-01-03'), + pd.Timestamp('2011-01-04')]) + self._assert_setitem_series_conversion(obj, pd.Timestamp('2012-01-01'), + exp, 'datetime64[ns]') + + # datetime64 + int -> object + # ToDo: The result must be object + exp = pd.Series([pd.Timestamp('2011-01-01'), + pd.Timestamp(1), + pd.Timestamp('2011-01-03'), + pd.Timestamp('2011-01-04')]) + self._assert_setitem_series_conversion(obj, 1, exp, 'datetime64[ns]') + + # ToDo: add more tests once the above issue has been fixed + + def test_setitem_series_datetime64tz(self): + tz = 'US/Eastern' + obj = pd.Series([pd.Timestamp('2011-01-01', tz=tz), + pd.Timestamp('2011-01-02', tz=tz), + pd.Timestamp('2011-01-03', tz=tz), + pd.Timestamp('2011-01-04', tz=tz)]) + self.assertEqual(obj.dtype, 'datetime64[ns, US/Eastern]') + + # datetime64tz + datetime64tz -> datetime64tz + exp = 
pd.Series([pd.Timestamp('2011-01-01', tz=tz), + pd.Timestamp('2012-01-01', tz=tz), + pd.Timestamp('2011-01-03', tz=tz), + pd.Timestamp('2011-01-04', tz=tz)]) + value = pd.Timestamp('2012-01-01', tz=tz) + self._assert_setitem_series_conversion(obj, value, exp, + 'datetime64[ns, US/Eastern]') + + # datetime64 + int -> object + # ToDo: The result must be object + exp = pd.Series([pd.Timestamp('2011-01-01', tz=tz), + pd.Timestamp(1).tz_localize(tz), + pd.Timestamp('2011-01-03', tz=tz), + pd.Timestamp('2011-01-04', tz=tz)]) + self._assert_setitem_series_conversion(obj, 1, exp, + 'datetime64[ns, US/Eastern]') + + # ToDo: add more tests once the above issue has been fixed + + def test_setitem_series_timedelta64(self): + pass + + def test_setitem_series_period(self): + pass + + def _assert_setitem_index_conversion(self, original_series, loc_key, + expected_index, expected_dtype): + """ test index's coercion triggered by assign key """ + temp = original_series.copy() + temp[loc_key] = 5 + exp = pd.Series([1, 2, 3, 4, 5], index=expected_index) + tm.assert_series_equal(temp, exp) + # check dtype explicitly for sure + self.assertEqual(temp.index.dtype, expected_dtype) + + temp = original_series.copy() + temp.loc[loc_key] = 5 + exp = pd.Series([1, 2, 3, 4, 5], index=expected_index) + tm.assert_series_equal(temp, exp) + # check dtype explicitly for sure + self.assertEqual(temp.index.dtype, expected_dtype) + + def test_setitem_index_object(self): + obj = pd.Series([1, 2, 3, 4], index=list('abcd')) + self.assertEqual(obj.index.dtype, np.object) + + # object + object -> object + exp_index = pd.Index(list('abcdx')) + self._assert_setitem_index_conversion(obj, 'x', exp_index, np.object) + + # object + int -> IndexError, regarded as location + temp = obj.copy() + with tm.assertRaises(IndexError): + temp[5] = 5 + + # object + float -> object + exp_index = pd.Index(['a', 'b', 'c', 'd', 1.1]) + self._assert_setitem_index_conversion(obj, 1.1, exp_index, np.object) + + def test_setitem_index_int64(self): + # tests setitem with non-existing numeric key + obj = pd.Series([1, 2, 3, 4]) + self.assertEqual(obj.index.dtype, np.int64) # int + int -> int - res = s.where(cond, 1) - tm.assert_series_equal(res, pd.Series([1, 1, 3, 1])) - self.assertEqual(res.dtype, np.int64) - res = s.where(cond, pd.Series([5, 6, 7, 8])) - tm.assert_series_equal(res, pd.Series([1, 6, 3, 8])) - self.assertEqual(res.dtype, np.int64) + exp_index = pd.Index([0, 1, 2, 3, 5]) + self._assert_setitem_index_conversion(obj, 5, exp_index, np.int64) # int + float -> float - res = s.where(cond, 1.1) - tm.assert_series_equal(res, pd.Series([1, 1.1, 3, 1.1])) - self.assertEqual(res.dtype, np.float64) - res = s.where(cond, pd.Series([5.5, 6.6, 7.7, 8.8])) - tm.assert_series_equal(res, pd.Series([1, 6.6, 3, 8.8])) - self.assertEqual(res.dtype, np.float64) + exp_index = pd.Index([0, 1, 2, 3, 1.1]) + self._assert_setitem_index_conversion(obj, 1.1, exp_index, np.float64) + + # int + object -> object + exp_index = pd.Index([0, 1, 2, 3, 'x']) + self._assert_setitem_index_conversion(obj, 'x', exp_index, np.object) + + def test_setitem_index_float64(self): + # tests setitem with non-existing numeric key + obj = pd.Series([1, 2, 3, 4], index=[1.1, 2.1, 3.1, 4.1]) + self.assertEqual(obj.index.dtype, np.float64) + + # float + int -> int + temp = obj.copy() + # TODO_GH12747 The result must be float + with tm.assertRaises(IndexError): + temp[5] = 5 + + # float + float -> float + exp_index = pd.Index([1.1, 2.1, 3.1, 4.1, 5.1]) + self._assert_setitem_index_conversion(obj, 
5.1, exp_index, np.float64) + + # float + object -> object + exp_index = pd.Index([1.1, 2.1, 3.1, 4.1, 'x']) + self._assert_setitem_index_conversion(obj, 'x', exp_index, np.object) + + def test_setitem_index_complex128(self): + pass + + def test_setitem_index_bool(self): + pass + + def test_setitem_index_datetime64(self): + pass + + def test_setitem_index_datetime64tz(self): + pass + + def test_setitem_index_timedelta64(self): + pass + + def test_setitem_index_period(self): + pass + + +class TestInsertIndexCoercion(CoercionBase, tm.TestCase): + + klasses = ['index'] + method = 'insert' + + def _assert_insert_conversion(self, original, value, + expected, expected_dtype): + """ test coercion triggered by insert """ + target = original.copy() + res = target.insert(1, value) + tm.assert_index_equal(res, expected) + self.assertEqual(res.dtype, expected_dtype) + + def test_insert_index_object(self): + obj = pd.Index(list('abcd')) + self.assertEqual(obj.dtype, np.object) + + # object + int -> object + exp = pd.Index(['a', 1, 'b', 'c', 'd']) + self._assert_insert_conversion(obj, 1, exp, np.object) + + # object + float -> object + exp = pd.Index(['a', 1.1, 'b', 'c', 'd']) + self._assert_insert_conversion(obj, 1.1, exp, np.object) + + # object + bool -> object + res = obj.insert(1, False) + tm.assert_index_equal(res, pd.Index(['a', False, 'b', 'c', 'd'])) + self.assertEqual(res.dtype, np.object) + + # object + object -> object + exp = pd.Index(['a', 'x', 'b', 'c', 'd']) + self._assert_insert_conversion(obj, 'x', exp, np.object) + + def test_insert_index_int64(self): + obj = pd.Int64Index([1, 2, 3, 4]) + self.assertEqual(obj.dtype, np.int64) + + # int + int -> int + exp = pd.Index([1, 1, 2, 3, 4]) + self._assert_insert_conversion(obj, 1, exp, np.int64) + + # int + float -> float + exp = pd.Index([1, 1.1, 2, 3, 4]) + self._assert_insert_conversion(obj, 1.1, exp, np.float64) + + # int + bool -> int + exp = pd.Index([1, 0, 2, 3, 4]) + self._assert_insert_conversion(obj, False, exp, np.int64) + + # int + object -> object + exp = pd.Index([1, 'x', 2, 3, 4]) + self._assert_insert_conversion(obj, 'x', exp, np.object) + + def test_insert_index_float64(self): + obj = pd.Float64Index([1., 2., 3., 4.]) + self.assertEqual(obj.dtype, np.float64) + + # float + int -> int + exp = pd.Index([1., 1., 2., 3., 4.]) + self._assert_insert_conversion(obj, 1, exp, np.float64) + + # float + float -> float + exp = pd.Index([1., 1.1, 2., 3., 4.]) + self._assert_insert_conversion(obj, 1.1, exp, np.float64) + + # float + bool -> float + exp = pd.Index([1., 0., 2., 3., 4.]) + self._assert_insert_conversion(obj, False, exp, np.float64) + + # float + object -> object + exp = pd.Index([1., 'x', 2., 3., 4.]) + self._assert_insert_conversion(obj, 'x', exp, np.object) + + def test_insert_index_complex128(self): + pass + + def test_insert_index_bool(self): + pass + + def test_insert_index_datetime64(self): + obj = pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03', + '2011-01-04']) + self.assertEqual(obj.dtype, 'datetime64[ns]') + + # datetime64 + datetime64 => datetime64 + exp = pd.DatetimeIndex(['2011-01-01', '2012-01-01', '2011-01-02', + '2011-01-03', '2011-01-04']) + self._assert_insert_conversion(obj, pd.Timestamp('2012-01-01'), + exp, 'datetime64[ns]') + + # ToDo: must coerce to object + msg = "Passed item and index have different timezone" + with tm.assertRaisesRegexp(ValueError, msg): + obj.insert(1, pd.Timestamp('2012-01-01', tz='US/Eastern')) + + # ToDo: must coerce to object + msg = "cannot insert DatetimeIndex with 
incompatible label" + with tm.assertRaisesRegexp(TypeError, msg): + obj.insert(1, 1) + + def test_insert_index_datetime64tz(self): + obj = pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03', + '2011-01-04'], tz='US/Eastern') + self.assertEqual(obj.dtype, 'datetime64[ns, US/Eastern]') + + # datetime64tz + datetime64tz => datetime64 + exp = pd.DatetimeIndex(['2011-01-01', '2012-01-01', '2011-01-02', + '2011-01-03', '2011-01-04'], tz='US/Eastern') + val = pd.Timestamp('2012-01-01', tz='US/Eastern') + self._assert_insert_conversion(obj, val, exp, + 'datetime64[ns, US/Eastern]') + + # ToDo: must coerce to object + msg = "Passed item and index have different timezone" + with tm.assertRaisesRegexp(ValueError, msg): + obj.insert(1, pd.Timestamp('2012-01-01')) + + # ToDo: must coerce to object + msg = "Passed item and index have different timezone" + with tm.assertRaisesRegexp(ValueError, msg): + obj.insert(1, pd.Timestamp('2012-01-01', tz='Asia/Tokyo')) + + # ToDo: must coerce to object + msg = "cannot insert DatetimeIndex with incompatible label" + with tm.assertRaisesRegexp(TypeError, msg): + obj.insert(1, 1) + + def test_insert_index_timedelta64(self): + obj = pd.TimedeltaIndex(['1 day', '2 day', '3 day', '4 day']) + self.assertEqual(obj.dtype, 'timedelta64[ns]') + + # timedelta64 + timedelta64 => timedelta64 + exp = pd.TimedeltaIndex(['1 day', '10 day', '2 day', '3 day', '4 day']) + self._assert_insert_conversion(obj, pd.Timedelta('10 day'), + exp, 'timedelta64[ns]') + + # ToDo: must coerce to object + msg = "cannot insert TimedeltaIndex with incompatible label" + with tm.assertRaisesRegexp(TypeError, msg): + obj.insert(1, pd.Timestamp('2012-01-01')) + + # ToDo: must coerce to object + msg = "cannot insert TimedeltaIndex with incompatible label" + with tm.assertRaisesRegexp(TypeError, msg): + obj.insert(1, 1) + + def test_insert_index_period(self): + obj = pd.PeriodIndex(['2011-01', '2011-02', '2011-03', '2011-04'], + freq='M') + self.assertEqual(obj.dtype, 'period[M]') + + # period + period => period + exp = pd.PeriodIndex(['2011-01', '2012-01', '2011-02', + '2011-03', '2011-04'], freq='M') + self._assert_insert_conversion(obj, pd.Period('2012-01', freq='M'), + exp, 'period[M]') + + # period + datetime64 => object + exp = pd.Index([pd.Period('2011-01', freq='M'), + pd.Timestamp('2012-01-01'), + pd.Period('2011-02', freq='M'), + pd.Period('2011-03', freq='M'), + pd.Period('2011-04', freq='M')], freq='M') + self._assert_insert_conversion(obj, pd.Timestamp('2012-01-01'), + exp, np.object) + + # period + int => object + exp = pd.Index([pd.Period('2011-01', freq='M'), + 1, + pd.Period('2011-02', freq='M'), + pd.Period('2011-03', freq='M'), + pd.Period('2011-04', freq='M')], freq='M') + self._assert_insert_conversion(obj, 1, exp, np.object) + + # period + object => object + exp = pd.Index([pd.Period('2011-01', freq='M'), + 'x', + pd.Period('2011-02', freq='M'), + pd.Period('2011-03', freq='M'), + pd.Period('2011-04', freq='M')], freq='M') + self._assert_insert_conversion(obj, 'x', exp, np.object) + + +class TestWhereCoercion(CoercionBase, tm.TestCase): + + method = 'where' + + def _assert_where_conversion(self, original, cond, values, + expected, expected_dtype): + """ test coercion triggered by where """ + target = original.copy() + res = target.where(cond, values) + self._assert(res, expected, expected_dtype) + + def _where_object_common(self, klass): + obj = klass(list('abcd')) + self.assertEqual(obj.dtype, np.object) + cond = klass([True, False, True, False]) + + # object + int -> 
object + exp = klass(['a', 1, 'c', 1]) + self._assert_where_conversion(obj, cond, 1, exp, np.object) + + values = klass([5, 6, 7, 8]) + exp = klass(['a', 6, 'c', 8]) + self._assert_where_conversion(obj, cond, values, exp, np.object) + + # object + float -> object + exp = klass(['a', 1.1, 'c', 1.1]) + self._assert_where_conversion(obj, cond, 1.1, exp, np.object) + + values = klass([5.5, 6.6, 7.7, 8.8]) + exp = klass(['a', 6.6, 'c', 8.8]) + self._assert_where_conversion(obj, cond, values, exp, np.object) + + # object + complex -> object + exp = klass(['a', 1 + 1j, 'c', 1 + 1j]) + self._assert_where_conversion(obj, cond, 1 + 1j, exp, np.object) + + values = klass([5 + 5j, 6 + 6j, 7 + 7j, 8 + 8j]) + exp = klass(['a', 6 + 6j, 'c', 8 + 8j]) + self._assert_where_conversion(obj, cond, values, exp, np.object) + + if klass is pd.Series: + exp = klass(['a', 1, 'c', 1]) + self._assert_where_conversion(obj, cond, True, exp, np.object) + + values = klass([True, False, True, True]) + exp = klass(['a', 0, 'c', 1]) + self._assert_where_conversion(obj, cond, values, exp, np.object) + elif klass is pd.Index: + # object + bool -> object + exp = klass(['a', True, 'c', True]) + self._assert_where_conversion(obj, cond, True, exp, np.object) + + values = klass([True, False, True, True]) + exp = klass(['a', False, 'c', True]) + self._assert_where_conversion(obj, cond, values, exp, np.object) + else: + raise NotImplementedError + + def test_where_series_object(self): + self._where_object_common(pd.Series) + + def test_where_index_object(self): + self._where_object_common(pd.Index) + + def _where_int64_common(self, klass): + obj = klass([1, 2, 3, 4]) + self.assertEqual(obj.dtype, np.int64) + cond = klass([True, False, True, False]) + + # int + int -> int + exp = klass([1, 1, 3, 1]) + self._assert_where_conversion(obj, cond, 1, exp, np.int64) + + values = klass([5, 6, 7, 8]) + exp = klass([1, 6, 3, 8]) + self._assert_where_conversion(obj, cond, values, exp, np.int64) + + # int + float -> float + exp = klass([1, 1.1, 3, 1.1]) + self._assert_where_conversion(obj, cond, 1.1, exp, np.float64) + + values = klass([5.5, 6.6, 7.7, 8.8]) + exp = klass([1, 6.6, 3, 8.8]) + self._assert_where_conversion(obj, cond, values, exp, np.float64) # int + complex -> complex - res = s.where(cond, 1 + 1j) - tm.assert_series_equal(res, pd.Series([1, 1 + 1j, 3, 1 + 1j])) - self.assertEqual(res.dtype, np.complex128) - res = s.where(cond, pd.Series([5 + 5j, 6 + 6j, 7 + 7j, 8 + 8j])) - tm.assert_series_equal(res, pd.Series([1, 6 + 6j, 3, 8 + 8j])) - self.assertEqual(res.dtype, np.complex128) + if klass is pd.Series: + exp = klass([1, 1 + 1j, 3, 1 + 1j]) + self._assert_where_conversion(obj, cond, 1 + 1j, exp, + np.complex128) + + values = klass([5 + 5j, 6 + 6j, 7 + 7j, 8 + 8j]) + exp = klass([1, 6 + 6j, 3, 8 + 8j]) + self._assert_where_conversion(obj, cond, values, exp, + np.complex128) # int + bool -> int - res = s.where(cond, True) - tm.assert_series_equal(res, pd.Series([1, 1, 3, 1])) - self.assertEqual(res.dtype, np.int64) - res = s.where(cond, pd.Series([True, False, True, True])) - tm.assert_series_equal(res, pd.Series([1, 0, 3, 1])) - self.assertEqual(res.dtype, np.int64) - - def test_where_numeric_coercion_float(self): - s = pd.Series([1.1, 2.2, 3.3, 4.4]) - self.assertEqual(s.dtype, np.float64) - cond = pd.Series([True, False, True, False]) + exp = klass([1, 1, 3, 1]) + self._assert_where_conversion(obj, cond, True, exp, np.int64) + + values = klass([True, False, True, True]) + exp = klass([1, 0, 3, 1]) + self._assert_where_conversion(obj, 
cond, values, exp, np.int64) + + def test_where_series_int64(self): + self._where_int64_common(pd.Series) + + def test_where_index_int64(self): + self._where_int64_common(pd.Index) + + def _where_float64_common(self, klass): + obj = klass([1.1, 2.2, 3.3, 4.4]) + self.assertEqual(obj.dtype, np.float64) + cond = klass([True, False, True, False]) # float + int -> float - res = s.where(cond, 1) - tm.assert_series_equal(res, pd.Series([1.1, 1.0, 3.3, 1.0])) - self.assertEqual(res.dtype, np.float64) - res = s.where(cond, pd.Series([5, 6, 7, 8])) - tm.assert_series_equal(res, pd.Series([1.1, 6.0, 3.3, 8.0])) - self.assertEqual(res.dtype, np.float64) + exp = klass([1.1, 1.0, 3.3, 1.0]) + self._assert_where_conversion(obj, cond, 1, exp, np.float64) + + values = klass([5, 6, 7, 8]) + exp = klass([1.1, 6.0, 3.3, 8.0]) + self._assert_where_conversion(obj, cond, values, exp, np.float64) # float + float -> float - res = s.where(cond, 1.1) - tm.assert_series_equal(res, pd.Series([1.1, 1.1, 3.3, 1.1])) - self.assertEqual(res.dtype, np.float64) - res = s.where(cond, pd.Series([5.5, 6.6, 7.7, 8.8])) - tm.assert_series_equal(res, pd.Series([1.1, 6.6, 3.3, 8.8])) - self.assertEqual(res.dtype, np.float64) + exp = klass([1.1, 1.1, 3.3, 1.1]) + self._assert_where_conversion(obj, cond, 1.1, exp, np.float64) + + values = klass([5.5, 6.6, 7.7, 8.8]) + exp = klass([1.1, 6.6, 3.3, 8.8]) + self._assert_where_conversion(obj, cond, values, exp, np.float64) # float + complex -> complex - res = s.where(cond, 1 + 1j) - tm.assert_series_equal(res, pd.Series([1.1, 1 + 1j, 3.3, 1 + 1j])) - self.assertEqual(res.dtype, np.complex128) - res = s.where(cond, pd.Series([5 + 5j, 6 + 6j, 7 + 7j, 8 + 8j])) - tm.assert_series_equal(res, pd.Series([1.1, 6 + 6j, 3.3, 8 + 8j])) - self.assertEqual(res.dtype, np.complex128) + if klass is pd.Series: + exp = klass([1.1, 1 + 1j, 3.3, 1 + 1j]) + self._assert_where_conversion(obj, cond, 1 + 1j, exp, + np.complex128) + + values = klass([5 + 5j, 6 + 6j, 7 + 7j, 8 + 8j]) + exp = klass([1.1, 6 + 6j, 3.3, 8 + 8j]) + self._assert_where_conversion(obj, cond, values, exp, + np.complex128) # float + bool -> float - res = s.where(cond, True) - tm.assert_series_equal(res, pd.Series([1.1, 1.0, 3.3, 1.0])) - self.assertEqual(res.dtype, np.float64) - res = s.where(cond, pd.Series([True, False, True, True])) - tm.assert_series_equal(res, pd.Series([1.1, 0.0, 3.3, 1.0])) - self.assertEqual(res.dtype, np.float64) - - def test_where_numeric_coercion_complex(self): - s = pd.Series([1 + 1j, 2 + 2j, 3 + 3j, 4 + 4j]) - self.assertEqual(s.dtype, np.complex128) + exp = klass([1.1, 1.0, 3.3, 1.0]) + self._assert_where_conversion(obj, cond, True, exp, np.float64) + + values = klass([True, False, True, True]) + exp = klass([1.1, 0.0, 3.3, 1.0]) + self._assert_where_conversion(obj, cond, values, exp, np.float64) + + def test_where_series_float64(self): + self._where_float64_common(pd.Series) + + def test_where_index_float64(self): + self._where_float64_common(pd.Index) + + def test_where_series_complex128(self): + obj = pd.Series([1 + 1j, 2 + 2j, 3 + 3j, 4 + 4j]) + self.assertEqual(obj.dtype, np.complex128) cond = pd.Series([True, False, True, False]) - # complex + int -> float - res = s.where(cond, 1) - tm.assert_series_equal(res, pd.Series([1 + 1j, 1, 3 + 3j, 1])) - self.assertEqual(res.dtype, np.complex128) - res = s.where(cond, pd.Series([5, 6, 7, 8])) - tm.assert_series_equal(res, pd.Series([1 + 1j, 6.0, 3 + 3j, 8.0])) - self.assertEqual(res.dtype, np.complex128) - - # complex + float -> float - res = s.where(cond, 
1.1) - tm.assert_series_equal(res, pd.Series([1 + 1j, 1.1, 3 + 3j, 1.1])) - self.assertEqual(res.dtype, np.complex128) - res = s.where(cond, pd.Series([5.5, 6.6, 7.7, 8.8])) - tm.assert_series_equal(res, pd.Series([1 + 1j, 6.6, 3 + 3j, 8.8])) - self.assertEqual(res.dtype, np.complex128) + # complex + int -> complex + exp = pd.Series([1 + 1j, 1, 3 + 3j, 1]) + self._assert_where_conversion(obj, cond, 1, exp, np.complex128) + + values = pd.Series([5, 6, 7, 8]) + exp = pd.Series([1 + 1j, 6.0, 3 + 3j, 8.0]) + self._assert_where_conversion(obj, cond, values, exp, np.complex128) + + # complex + float -> complex + exp = pd.Series([1 + 1j, 1.1, 3 + 3j, 1.1]) + self._assert_where_conversion(obj, cond, 1.1, exp, np.complex128) + + values = pd.Series([5.5, 6.6, 7.7, 8.8]) + exp = pd.Series([1 + 1j, 6.6, 3 + 3j, 8.8]) + self._assert_where_conversion(obj, cond, values, exp, np.complex128) # complex + complex -> complex - res = s.where(cond, 1 + 1j) - tm.assert_series_equal(res, - pd.Series([1 + 1j, 1 + 1j, 3 + 3j, 1 + 1j])) - self.assertEqual(res.dtype, np.complex128) - res = s.where(cond, pd.Series([5 + 5j, 6 + 6j, 7 + 7j, 8 + 8j])) - tm.assert_series_equal(res, - pd.Series([1 + 1j, 6 + 6j, 3 + 3j, 8 + 8j])) - self.assertEqual(res.dtype, np.complex128) + exp = pd.Series([1 + 1j, 1 + 1j, 3 + 3j, 1 + 1j]) + self._assert_where_conversion(obj, cond, 1 + 1j, exp, np.complex128) + + values = pd.Series([5 + 5j, 6 + 6j, 7 + 7j, 8 + 8j]) + exp = pd.Series([1 + 1j, 6 + 6j, 3 + 3j, 8 + 8j]) + self._assert_where_conversion(obj, cond, values, exp, np.complex128) # complex + bool -> complex - res = s.where(cond, True) - tm.assert_series_equal(res, pd.Series([1 + 1j, 1, 3 + 3j, 1])) - self.assertEqual(res.dtype, np.complex128) - res = s.where(cond, pd.Series([True, False, True, True])) - tm.assert_series_equal(res, pd.Series([1 + 1j, 0, 3 + 3j, 1])) - self.assertEqual(res.dtype, np.complex128) - - def test_where_numeric_coercion_bool(self): - s = pd.Series([True, False, True, False]) - self.assertEqual(s.dtype, np.bool) + exp = pd.Series([1 + 1j, 1, 3 + 3j, 1]) + self._assert_where_conversion(obj, cond, True, exp, np.complex128) + + values = pd.Series([True, False, True, True]) + exp = pd.Series([1 + 1j, 0, 3 + 3j, 1]) + self._assert_where_conversion(obj, cond, values, exp, np.complex128) + + def test_where_index_complex128(self): + pass + + def test_where_series_bool(self): + obj = pd.Series([True, False, True, False]) + self.assertEqual(obj.dtype, np.bool) cond = pd.Series([True, False, True, False]) # bool + int -> int - res = s.where(cond, 1) - tm.assert_series_equal(res, pd.Series([1, 1, 1, 1])) - self.assertEqual(res.dtype, np.int64) - res = s.where(cond, pd.Series([5, 6, 7, 8])) - tm.assert_series_equal(res, pd.Series([1, 6, 1, 8])) - self.assertEqual(res.dtype, np.int64) + exp = pd.Series([1, 1, 1, 1]) + self._assert_where_conversion(obj, cond, 1, exp, np.int64) + + values = pd.Series([5, 6, 7, 8]) + exp = pd.Series([1, 6, 1, 8]) + self._assert_where_conversion(obj, cond, values, exp, np.int64) # bool + float -> float - res = s.where(cond, 1.1) - tm.assert_series_equal(res, pd.Series([1.0, 1.1, 1.0, 1.1])) - self.assertEqual(res.dtype, np.float64) - res = s.where(cond, pd.Series([5.5, 6.6, 7.7, 8.8])) - tm.assert_series_equal(res, pd.Series([1.0, 6.6, 1.0, 8.8])) - self.assertEqual(res.dtype, np.float64) + exp = pd.Series([1.0, 1.1, 1.0, 1.1]) + self._assert_where_conversion(obj, cond, 1.1, exp, np.float64) + + values = pd.Series([5.5, 6.6, 7.7, 8.8]) + exp = pd.Series([1.0, 6.6, 1.0, 8.8]) + 
self._assert_where_conversion(obj, cond, values, exp, np.float64) # bool + complex -> complex - res = s.where(cond, 1 + 1j) - tm.assert_series_equal(res, pd.Series([1, 1 + 1j, 1, 1 + 1j])) - self.assertEqual(res.dtype, np.complex128) - res = s.where(cond, pd.Series([5 + 5j, 6 + 6j, 7 + 7j, 8 + 8j])) - tm.assert_series_equal(res, pd.Series([1, 6 + 6j, 1, 8 + 8j])) - self.assertEqual(res.dtype, np.complex128) + exp = pd.Series([1, 1 + 1j, 1, 1 + 1j]) + self._assert_where_conversion(obj, cond, 1 + 1j, exp, np.complex128) + + values = pd.Series([5 + 5j, 6 + 6j, 7 + 7j, 8 + 8j]) + exp = pd.Series([1, 6 + 6j, 1, 8 + 8j]) + self._assert_where_conversion(obj, cond, values, exp, np.complex128) # bool + bool -> bool - res = s.where(cond, True) - tm.assert_series_equal(res, pd.Series([True, True, True, True])) - self.assertEqual(res.dtype, np.bool) - res = s.where(cond, pd.Series([True, False, True, True])) - tm.assert_series_equal(res, pd.Series([True, False, True, True])) - self.assertEqual(res.dtype, np.bool) + exp = pd.Series([True, True, True, True]) + self._assert_where_conversion(obj, cond, True, exp, np.bool) + + values = pd.Series([True, False, True, True]) + exp = pd.Series([True, False, True, True]) + self._assert_where_conversion(obj, cond, values, exp, np.bool) + + def test_where_index_bool(self): + pass + + def test_where_series_datetime64(self): + obj = pd.Series([pd.Timestamp('2011-01-01'), + pd.Timestamp('2011-01-02'), + pd.Timestamp('2011-01-03'), + pd.Timestamp('2011-01-04')]) + self.assertEqual(obj.dtype, 'datetime64[ns]') + cond = pd.Series([True, False, True, False]) + + # datetime64 + datetime64 -> datetime64 + exp = pd.Series([pd.Timestamp('2011-01-01'), + pd.Timestamp('2012-01-01'), + pd.Timestamp('2011-01-03'), + pd.Timestamp('2012-01-01')]) + self._assert_where_conversion(obj, cond, pd.Timestamp('2012-01-01'), + exp, 'datetime64[ns]') + + values = pd.Series([pd.Timestamp('2012-01-01'), + pd.Timestamp('2012-01-02'), + pd.Timestamp('2012-01-03'), + pd.Timestamp('2012-01-04')]) + exp = pd.Series([pd.Timestamp('2011-01-01'), + pd.Timestamp('2012-01-02'), + pd.Timestamp('2011-01-03'), + pd.Timestamp('2012-01-04')]) + self._assert_where_conversion(obj, cond, values, exp, 'datetime64[ns]') + + # ToDo: coerce to object + msg = "cannot coerce a Timestamp with a tz on a naive Block" + with tm.assertRaisesRegexp(TypeError, msg): + obj.where(cond, pd.Timestamp('2012-01-01', tz='US/Eastern')) + + # ToDo: do not coerce to UTC, must be object + values = pd.Series([pd.Timestamp('2012-01-01', tz='US/Eastern'), + pd.Timestamp('2012-01-02', tz='US/Eastern'), + pd.Timestamp('2012-01-03', tz='US/Eastern'), + pd.Timestamp('2012-01-04', tz='US/Eastern')]) + exp = pd.Series([pd.Timestamp('2011-01-01'), + pd.Timestamp('2012-01-02 05:00'), + pd.Timestamp('2011-01-03'), + pd.Timestamp('2012-01-04 05:00')]) + self._assert_where_conversion(obj, cond, values, exp, 'datetime64[ns]') + + def test_where_index_datetime64(self): + obj = pd.Index([pd.Timestamp('2011-01-01'), + pd.Timestamp('2011-01-02'), + pd.Timestamp('2011-01-03'), + pd.Timestamp('2011-01-04')]) + self.assertEqual(obj.dtype, 'datetime64[ns]') + cond = pd.Index([True, False, True, False]) + + # datetime64 + datetime64 -> datetime64 + # must support scalar + msg = "cannot coerce a Timestamp with a tz on a naive Block" + with tm.assertRaises(TypeError): + obj.where(cond, pd.Timestamp('2012-01-01')) + + values = pd.Index([pd.Timestamp('2012-01-01'), + pd.Timestamp('2012-01-02'), + pd.Timestamp('2012-01-03'), + pd.Timestamp('2012-01-04')]) + exp 
= pd.Index([pd.Timestamp('2011-01-01'), + pd.Timestamp('2012-01-02'), + pd.Timestamp('2011-01-03'), + pd.Timestamp('2012-01-04')]) + self._assert_where_conversion(obj, cond, values, exp, 'datetime64[ns]') + + # ToDo: coerce to object + msg = ("Index\\(\\.\\.\\.\\) must be called with a collection " + "of some kind") + with tm.assertRaisesRegexp(TypeError, msg): + obj.where(cond, pd.Timestamp('2012-01-01', tz='US/Eastern')) + + # ToDo: do not ignore timezone, must be object + values = pd.Index([pd.Timestamp('2012-01-01', tz='US/Eastern'), + pd.Timestamp('2012-01-02', tz='US/Eastern'), + pd.Timestamp('2012-01-03', tz='US/Eastern'), + pd.Timestamp('2012-01-04', tz='US/Eastern')]) + exp = pd.Index([pd.Timestamp('2011-01-01'), + pd.Timestamp('2012-01-02'), + pd.Timestamp('2011-01-03'), + pd.Timestamp('2012-01-04')]) + self._assert_where_conversion(obj, cond, values, exp, 'datetime64[ns]') + + def test_where_series_datetime64tz(self): + pass + + def test_where_series_timedelta64(self): + pass + + def test_where_series_period(self): + pass + + def test_where_index_datetime64tz(self): + pass + + def test_where_index_timedelta64(self): + pass + + def test_where_index_period(self): + pass + + +class TestFillnaSeriesCoercion(CoercionBase, tm.TestCase): # not indexing, but place here for consisntency - def test_fillna_numeric_coercion_int(self): + method = 'fillna' + + def _assert_fillna_conversion(self, original, value, + expected, expected_dtype): + """ test coercion triggered by fillna """ + target = original.copy() + res = target.fillna(value) + self._assert(res, expected, expected_dtype) + + def _fillna_object_common(self, klass): + obj = klass(['a', np.nan, 'c', 'd']) + self.assertEqual(obj.dtype, np.object) + + # object + int -> object + exp = klass(['a', 1, 'c', 'd']) + self._assert_fillna_conversion(obj, 1, exp, np.object) + + # object + float -> object + exp = klass(['a', 1.1, 'c', 'd']) + self._assert_fillna_conversion(obj, 1.1, exp, np.object) + + # object + complex -> object + exp = klass(['a', 1 + 1j, 'c', 'd']) + self._assert_fillna_conversion(obj, 1 + 1j, exp, np.object) + + # object + bool -> object + exp = klass(['a', True, 'c', 'd']) + self._assert_fillna_conversion(obj, True, exp, np.object) + + def test_fillna_series_object(self): + self._fillna_object_common(pd.Series) + + def test_fillna_index_object(self): + self._fillna_object_common(pd.Index) + + def test_fillna_series_int64(self): # int can't hold NaN pass - def test_fillna_numeric_coercion_float(self): - s = pd.Series([1.1, np.nan, 3.3, 4.4]) - self.assertEqual(s.dtype, np.float64) + def test_fillna_index_int64(self): + pass + + def _fillna_float64_common(self, klass): + obj = klass([1.1, np.nan, 3.3, 4.4]) + self.assertEqual(obj.dtype, np.float64) # float + int -> float - res = s.fillna(1) - tm.assert_series_equal(res, pd.Series([1.1, 1.0, 3.3, 4.4])) - self.assertEqual(res.dtype, np.float64) + exp = klass([1.1, 1.0, 3.3, 4.4]) + self._assert_fillna_conversion(obj, 1, exp, np.float64) # float + float -> float - res = s.fillna(1.1) - tm.assert_series_equal(res, pd.Series([1.1, 1.1, 3.3, 4.4])) - self.assertEqual(res.dtype, np.float64) - - # float + complex -> complex - res = s.fillna(1 + 1j) - tm.assert_series_equal(res, pd.Series([1.1, 1 + 1j, 3.3, 4.4])) - self.assertEqual(res.dtype, np.complex128) + exp = klass([1.1, 1.1, 3.3, 4.4]) + self._assert_fillna_conversion(obj, 1.1, exp, np.float64) + + if klass is pd.Series: + # float + complex -> complex + exp = klass([1.1, 1 + 1j, 3.3, 4.4]) + 
self._assert_fillna_conversion(obj, 1 + 1j, exp, np.complex128) + elif klass is pd.Index: + # float + complex -> object + exp = klass([1.1, 1 + 1j, 3.3, 4.4]) + self._assert_fillna_conversion(obj, 1 + 1j, exp, np.object) + else: + raise NotImplementedError # float + bool -> float - res = s.fillna(True) - tm.assert_series_equal(res, pd.Series([1.1, 1.0, 3.3, 4.4])) - self.assertEqual(res.dtype, np.float64) + exp = klass([1.1, 1.0, 3.3, 4.4]) + self._assert_fillna_conversion(obj, True, exp, np.float64) + + def test_fillna_series_float64(self): + self._fillna_float64_common(pd.Series) - def test_fillna_numeric_coercion_complex(self): - s = pd.Series([1 + 1j, np.nan, 3 + 3j, 4 + 4j]) - self.assertEqual(s.dtype, np.complex128) + def test_fillna_index_float64(self): + self._fillna_float64_common(pd.Index) + + def test_fillna_series_complex128(self): + obj = pd.Series([1 + 1j, np.nan, 3 + 3j, 4 + 4j]) + self.assertEqual(obj.dtype, np.complex128) # complex + int -> complex - res = s.fillna(1) - tm.assert_series_equal(res, pd.Series([1 + 1j, 1, 3 + 3j, 4 + 4j])) - self.assertEqual(res.dtype, np.complex128) + exp = pd.Series([1 + 1j, 1, 3 + 3j, 4 + 4j]) + self._assert_fillna_conversion(obj, 1, exp, np.complex128) # complex + float -> complex - res = s.fillna(1.1) - tm.assert_series_equal(res, pd.Series([1 + 1j, 1.1, 3 + 3j, 4 + 4j])) - self.assertEqual(res.dtype, np.complex128) + exp = pd.Series([1 + 1j, 1.1, 3 + 3j, 4 + 4j]) + self._assert_fillna_conversion(obj, 1.1, exp, np.complex128) # complex + complex -> complex - res = s.fillna(1 + 1j) - tm.assert_series_equal(res, - pd.Series([1 + 1j, 1 + 1j, 3 + 3j, 4 + 4j])) - self.assertEqual(res.dtype, np.complex128) + exp = pd.Series([1 + 1j, 1 + 1j, 3 + 3j, 4 + 4j]) + self._assert_fillna_conversion(obj, 1 + 1j, exp, np.complex128) # complex + bool -> complex - res = s.fillna(True) - tm.assert_series_equal(res, pd.Series([1 + 1j, 1, 3 + 3j, 4 + 4j])) - self.assertEqual(res.dtype, np.complex128) + exp = pd.Series([1 + 1j, 1, 3 + 3j, 4 + 4j]) + self._assert_fillna_conversion(obj, True, exp, np.complex128) + + def test_fillna_index_complex128(self): + self._fillna_float64_common(pd.Index) - def test_fillna_numeric_coercion_bool(self): + def test_fillna_series_bool(self): # bool can't hold NaN pass + def test_fillna_index_bool(self): + pass + + def test_fillna_series_datetime64(self): + obj = pd.Series([pd.Timestamp('2011-01-01'), + pd.NaT, + pd.Timestamp('2011-01-03'), + pd.Timestamp('2011-01-04')]) + self.assertEqual(obj.dtype, 'datetime64[ns]') + + # datetime64 + datetime64 => datetime64 + exp = pd.Series([pd.Timestamp('2011-01-01'), + pd.Timestamp('2012-01-01'), + pd.Timestamp('2011-01-03'), + pd.Timestamp('2011-01-04')]) + self._assert_fillna_conversion(obj, pd.Timestamp('2012-01-01'), + exp, 'datetime64[ns]') + + # datetime64 + datetime64tz => object + exp = pd.Series([pd.Timestamp('2011-01-01'), + pd.Timestamp('2012-01-01', tz='US/Eastern'), + pd.Timestamp('2011-01-03'), + pd.Timestamp('2011-01-04')]) + value = pd.Timestamp('2012-01-01', tz='US/Eastern') + self._assert_fillna_conversion(obj, value, exp, np.object) + + # datetime64 + int => object + # ToDo: must be coerced to object + exp = pd.Series([pd.Timestamp('2011-01-01'), + pd.Timestamp(1), + pd.Timestamp('2011-01-03'), + pd.Timestamp('2011-01-04')]) + self._assert_fillna_conversion(obj, 1, exp, 'datetime64[ns]') + + # datetime64 + object => object + exp = pd.Series([pd.Timestamp('2011-01-01'), + 'x', + pd.Timestamp('2011-01-03'), + pd.Timestamp('2011-01-04')]) + self._assert_fillna_conversion(obj, 
'x', exp, np.object) + + def test_fillna_series_datetime64tz(self): + tz = 'US/Eastern' + + obj = pd.Series([pd.Timestamp('2011-01-01', tz=tz), + pd.NaT, + pd.Timestamp('2011-01-03', tz=tz), + pd.Timestamp('2011-01-04', tz=tz)]) + self.assertEqual(obj.dtype, 'datetime64[ns, US/Eastern]') + + # datetime64tz + datetime64tz => datetime64tz + exp = pd.Series([pd.Timestamp('2011-01-01', tz=tz), + pd.Timestamp('2012-01-01', tz=tz), + pd.Timestamp('2011-01-03', tz=tz), + pd.Timestamp('2011-01-04', tz=tz)]) + value = pd.Timestamp('2012-01-01', tz=tz) + self._assert_fillna_conversion(obj, value, exp, + 'datetime64[ns, US/Eastern]') + + # datetime64tz + datetime64 => object + exp = pd.Series([pd.Timestamp('2011-01-01', tz=tz), + pd.Timestamp('2012-01-01'), + pd.Timestamp('2011-01-03', tz=tz), + pd.Timestamp('2011-01-04', tz=tz)]) + value = pd.Timestamp('2012-01-01') + self._assert_fillna_conversion(obj, value, exp, np.object) + + # datetime64tz + datetime64tz(different tz) => object + exp = pd.Series([pd.Timestamp('2011-01-01', tz=tz), + pd.Timestamp('2012-01-01', tz='Asia/Tokyo'), + pd.Timestamp('2011-01-03', tz=tz), + pd.Timestamp('2011-01-04', tz=tz)]) + value = pd.Timestamp('2012-01-01', tz='Asia/Tokyo') + self._assert_fillna_conversion(obj, value, exp, np.object) + + # datetime64tz + int => datetime64tz + # ToDo: must be object + exp = pd.Series([pd.Timestamp('2011-01-01', tz=tz), + pd.Timestamp(1).tz_localize(tz=tz), + pd.Timestamp('2011-01-03', tz=tz), + pd.Timestamp('2011-01-04', tz=tz)]) + self._assert_fillna_conversion(obj, 1, exp, + 'datetime64[ns, US/Eastern]') + + # datetime64tz + object => object + exp = pd.Series([pd.Timestamp('2011-01-01', tz=tz), + 'x', + pd.Timestamp('2011-01-03', tz=tz), + pd.Timestamp('2011-01-04', tz=tz)]) + self._assert_fillna_conversion(obj, 'x', exp, np.object) + + def test_fillna_series_timedelta64(self): + pass + + def test_fillna_series_period(self): + pass + + def test_fillna_index_datetime64(self): + obj = pd.DatetimeIndex(['2011-01-01', 'NaT', '2011-01-03', + '2011-01-04']) + self.assertEqual(obj.dtype, 'datetime64[ns]') + + # datetime64 + datetime64 => datetime64 + exp = pd.DatetimeIndex(['2011-01-01', '2012-01-01', + '2011-01-03', '2011-01-04']) + self._assert_fillna_conversion(obj, pd.Timestamp('2012-01-01'), + exp, 'datetime64[ns]') + + # datetime64 + datetime64tz => object + exp = pd.Index([pd.Timestamp('2011-01-01'), + pd.Timestamp('2012-01-01', tz='US/Eastern'), + pd.Timestamp('2011-01-03'), + pd.Timestamp('2011-01-04')]) + value = pd.Timestamp('2012-01-01', tz='US/Eastern') + self._assert_fillna_conversion(obj, value, exp, np.object) + + # datetime64 + int => object + exp = pd.Index([pd.Timestamp('2011-01-01'), + 1, + pd.Timestamp('2011-01-03'), + pd.Timestamp('2011-01-04')]) + self._assert_fillna_conversion(obj, 1, exp, np.object) + + # datetime64 + object => object + exp = pd.Index([pd.Timestamp('2011-01-01'), + 'x', + pd.Timestamp('2011-01-03'), + pd.Timestamp('2011-01-04')]) + self._assert_fillna_conversion(obj, 'x', exp, np.object) + + def test_fillna_index_datetime64tz(self): + tz = 'US/Eastern' + + obj = pd.DatetimeIndex(['2011-01-01', 'NaT', '2011-01-03', + '2011-01-04'], tz=tz) + self.assertEqual(obj.dtype, 'datetime64[ns, US/Eastern]') + + # datetime64tz + datetime64tz => datetime64tz + exp = pd.DatetimeIndex(['2011-01-01', '2012-01-01', + '2011-01-03', '2011-01-04'], tz=tz) + value = pd.Timestamp('2012-01-01', tz=tz) + self._assert_fillna_conversion(obj, value, exp, + 'datetime64[ns, US/Eastern]') + + # datetime64tz + datetime64 => 
object + exp = pd.Index([pd.Timestamp('2011-01-01', tz=tz), + pd.Timestamp('2012-01-01'), + pd.Timestamp('2011-01-03', tz=tz), + pd.Timestamp('2011-01-04', tz=tz)]) + value = pd.Timestamp('2012-01-01') + self._assert_fillna_conversion(obj, value, exp, np.object) + + # datetime64tz + datetime64tz(different tz) => object + exp = pd.Index([pd.Timestamp('2011-01-01', tz=tz), + pd.Timestamp('2012-01-01', tz='Asia/Tokyo'), + pd.Timestamp('2011-01-03', tz=tz), + pd.Timestamp('2011-01-04', tz=tz)]) + value = pd.Timestamp('2012-01-01', tz='Asia/Tokyo') + self._assert_fillna_conversion(obj, value, exp, np.object) + + # datetime64tz + int => object + exp = pd.Index([pd.Timestamp('2011-01-01', tz=tz), + 1, + pd.Timestamp('2011-01-03', tz=tz), + pd.Timestamp('2011-01-04', tz=tz)]) + self._assert_fillna_conversion(obj, 1, exp, np.object) + + # datetime64tz + object => object + exp = pd.Index([pd.Timestamp('2011-01-01', tz=tz), + 'x', + pd.Timestamp('2011-01-03', tz=tz), + pd.Timestamp('2011-01-04', tz=tz)]) + self._assert_fillna_conversion(obj, 'x', exp, np.object) + + def test_fillna_index_timedelta64(self): + pass + + def test_fillna_index_period(self): + pass + + +class TestReplaceSeriesCoercion(CoercionBase, tm.TestCase): + + # not indexing, but place here for consistency + + klasses = ['series'] + method = 'replace' + + def setUp(self): + self.rep = {} + self.rep['object'] = ['a', 'b'] + self.rep['int64'] = [4, 5] + self.rep['float64'] = [1.1, 2.2] + self.rep['complex128'] = [1 + 1j, 2 + 2j] + self.rep['bool'] = [True, False] + def _assert_replace_conversion(self, from_key, to_key, how): index = pd.Index([3, 4], name='xxx') - s = pd.Series(self.rep[from_key], index=index, name='yyy') - self.assertEqual(s.dtype, from_key) + obj = pd.Series(self.rep[from_key], index=index, name='yyy') + self.assertEqual(obj.dtype, from_key) if how == 'dict': replacer = dict(zip(self.rep[from_key], self.rep[to_key])) @@ -459,7 +1168,14 @@ def _assert_replace_conversion(self, from_key, to_key, how): else: raise ValueError - result = s.replace(replacer) + result = obj.replace(replacer) + + # buggy on windows for bool/int64 + if (from_key == 'bool' and + to_key == 'int64' and + tm.is_platform_windows()): + raise nose.SkipTest("windows platform buggy: {0} -> {1}".format + (from_key, to_key)) if ((from_key == 'float64' and to_key in ('bool', 'int64')) or @@ -471,7 +1187,7 @@ def _assert_replace_conversion(self, from_key, to_key, how): to_key in ('bool')) or # TODO_GH12747 The result must be int? 
- (from_key == 'bool' and to_key in ('int64'))): + (from_key == 'bool' and to_key == 'int64')): # buggy on 32-bit if tm.is_platform_32bit(): @@ -488,54 +1204,43 @@ def _assert_replace_conversion(self, from_key, to_key, how): tm.assert_series_equal(result, exp) - def test_replace_conversion_dict_from_object(self): + def test_replace_series_object(self): from_key = 'object' for to_key in self.rep: self._assert_replace_conversion(from_key, to_key, how='dict') - def test_replace_conversion_dict_from_int(self): - from_key = 'int64' for to_key in self.rep: - self._assert_replace_conversion(from_key, to_key, how='dict') + self._assert_replace_conversion(from_key, to_key, how='series') - def test_replace_conversion_dict_from_float(self): - from_key = 'float64' + def test_replace_series_int64(self): + from_key = 'int64' for to_key in self.rep: self._assert_replace_conversion(from_key, to_key, how='dict') - def test_replace_conversion_dict_from_complex(self): - from_key = 'complex128' for to_key in self.rep: - self._assert_replace_conversion(from_key, to_key, how='dict') + self._assert_replace_conversion(from_key, to_key, how='series') - def test_replace_conversion_dict_from_bool(self): - from_key = 'bool' + def test_replace_series_float64(self): + from_key = 'float64' for to_key in self.rep: self._assert_replace_conversion(from_key, to_key, how='dict') - # Series - def test_replace_conversion_series_from_object(self): - from_key = 'object' for to_key in self.rep: self._assert_replace_conversion(from_key, to_key, how='series') - def test_replace_conversion_series_from_int(self): - from_key = 'int64' + def test_replace_series_complex128(self): + from_key = 'complex128' for to_key in self.rep: - self._assert_replace_conversion(from_key, to_key, how='series') + self._assert_replace_conversion(from_key, to_key, how='dict') - def test_replace_conversion_series_from_float(self): - from_key = 'float64' for to_key in self.rep: self._assert_replace_conversion(from_key, to_key, how='series') - def test_replace_conversion_series_from_complex(self): - from_key = 'complex128' + def test_replace_series_bool(self): + from_key = 'bool' for to_key in self.rep: - self._assert_replace_conversion(from_key, to_key, how='series') + self._assert_replace_conversion(from_key, to_key, how='dict') - def test_replace_conversion_series_from_bool(self): - from_key = 'bool' for to_key in self.rep: if compat.PY3: @@ -543,3 +1248,15 @@ def test_replace_conversion_series_from_bool(self): raise nose.SkipTest("doesn't work as in PY3") self._assert_replace_conversion(from_key, to_key, how='series') + + def test_replace_series_datetime64(self): + pass + + def test_replace_series_datetime64tz(self): + pass + + def test_replace_series_timedelta64(self): + pass + + def test_replace_series_period(self): + pass diff --git a/pandas/tests/indexing/test_floats.py b/pandas/tests/indexing/test_floats.py index 29f3889d20bd0..920aefa24b576 100644 --- a/pandas/tests/indexing/test_floats.py +++ b/pandas/tests/indexing/test_floats.py @@ -676,3 +676,14 @@ def test_floating_misc(self): assert_series_equal(result1, result2) assert_series_equal(result1, result3) assert_series_equal(result1, Series([1], index=[2.5])) + + def test_floating_tuples(self): + # GH13509 + s = Series([(1, 1), (2, 2), (3, 3)], index=[0.0, 0.1, 0.2], name='foo') + result = s[0.0] + self.assertEqual(result, (1, 1)) + + s = Series([(1, 1), (2, 2), (3, 3)], index=[0.0, 0.0, 0.2], name='foo') + result = s[0.0] + expected = Series([(1, 1), (2, 2)], index=[0.0, 0.0], name='foo') + 
assert_series_equal(result, expected) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index b86b248ead290..e0d63d5aa0c44 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -6,6 +6,9 @@ import warnings from datetime import datetime +from pandas.types.common import (is_integer_dtype, + is_float_dtype, + is_scalar) from pandas.compat import range, lrange, lzip, StringIO, lmap, map from pandas.tslib import NaT from numpy import nan @@ -18,11 +21,8 @@ from pandas.core.indexing import _non_reducing_slice, _maybe_numeric_slice from pandas.core.api import (DataFrame, Index, Series, Panel, isnull, MultiIndex, Timestamp, Timedelta) -from pandas.util.testing import (assert_almost_equal, assert_series_equal, - assert_frame_equal, assert_panel_equal, - assert_attr_equal, slow) from pandas.formats.printing import pprint_thing -from pandas import concat, lib +from pandas import concat from pandas.core.common import PerformanceWarning import pandas.util.testing as tm @@ -108,32 +108,34 @@ def setUp(self): warnings.filterwarnings(action='ignore', category=FutureWarning) self.series_ints = Series(np.random.rand(4), index=lrange(0, 8, 2)) - self.frame_ints = DataFrame( - np.random.randn( - 4, 4), index=lrange(0, 8, 2), columns=lrange(0, 12, 3)) - self.panel_ints = Panel( - np.random.rand(4, 4, 4), items=lrange(0, 8, 2), - major_axis=lrange(0, 12, 3), minor_axis=lrange(0, 16, 4)) + self.frame_ints = DataFrame(np.random.randn(4, 4), + index=lrange(0, 8, 2), + columns=lrange(0, 12, 3)) + self.panel_ints = Panel(np.random.rand(4, 4, 4), + items=lrange(0, 8, 2), + major_axis=lrange(0, 12, 3), + minor_axis=lrange(0, 16, 4)) self.series_labels = Series(np.random.randn(4), index=list('abcd')) - self.frame_labels = DataFrame( - np.random.randn(4, 4), index=list('abcd'), columns=list('ABCD')) - self.panel_labels = Panel( - np.random.randn(4, 4, 4), items=list('abcd'), - major_axis=list('ABCD'), minor_axis=list('ZYXW')) + self.frame_labels = DataFrame(np.random.randn(4, 4), + index=list('abcd'), columns=list('ABCD')) + self.panel_labels = Panel(np.random.randn(4, 4, 4), + items=list('abcd'), + major_axis=list('ABCD'), + minor_axis=list('ZYXW')) self.series_mixed = Series(np.random.randn(4), index=[2, 4, 'null', 8]) - self.frame_mixed = DataFrame( - np.random.randn(4, 4), index=[2, 4, 'null', 8]) - self.panel_mixed = Panel( - np.random.randn(4, 4, 4), items=[2, 4, 'null', 8]) - - self.series_ts = Series( - np.random.randn(4), index=date_range('20130101', periods=4)) - self.frame_ts = DataFrame( - np.random.randn(4, 4), index=date_range('20130101', periods=4)) - self.panel_ts = Panel( - np.random.randn(4, 4, 4), items=date_range('20130101', periods=4)) + self.frame_mixed = DataFrame(np.random.randn(4, 4), + index=[2, 4, 'null', 8]) + self.panel_mixed = Panel(np.random.randn(4, 4, 4), + items=[2, 4, 'null', 8]) + + self.series_ts = Series(np.random.randn(4), + index=date_range('20130101', periods=4)) + self.frame_ts = DataFrame(np.random.randn(4, 4), + index=date_range('20130101', periods=4)) + self.panel_ts = Panel(np.random.randn(4, 4, 4), + items=date_range('20130101', periods=4)) self.frame_empty = DataFrame({}) self.series_empty = Series({}) @@ -166,7 +168,7 @@ def check_values(self, f, func, values=False): for a in reversed(i): expected = expected.__getitem__(a) - assert_almost_equal(result, expected) + tm.assert_almost_equal(result, expected) def check_result(self, name, method1, key1, method2, key2, typs=None, objs=None, 
axes=None, fails=None): @@ -200,14 +202,14 @@ def _print(result, error=None): return try: - if lib.isscalar(rs) and lib.isscalar(xp): + if is_scalar(rs) and is_scalar(xp): self.assertEqual(rs, xp) elif xp.ndim == 1: - assert_series_equal(rs, xp) + tm.assert_series_equal(rs, xp) elif xp.ndim == 2: - assert_frame_equal(rs, xp) + tm.assert_frame_equal(rs, xp) elif xp.ndim == 3: - assert_panel_equal(rs, xp) + tm.assert_panel_equal(rs, xp) result = 'ok' except (AssertionError): result = 'fail' @@ -281,7 +283,7 @@ def test_indexer_caching(self): expected = Series(np.ones(n), index=index) s = Series(np.zeros(n), index=index) s[s == 0] = 1 - assert_series_equal(s, expected) + tm.assert_series_equal(s, expected) def test_at_and_iat_get(self): def _check(f, func, values=False): @@ -291,7 +293,7 @@ def _check(f, func, values=False): for i in indicies: result = getattr(f, func)[i] expected = _get_value(f, i, values) - assert_almost_equal(result, expected) + tm.assert_almost_equal(result, expected) for o in self._objs: @@ -317,7 +319,7 @@ def _check(f, func, values=False): for i in indicies: getattr(f, func)[i] = 1 expected = _get_value(f, i, values) - assert_almost_equal(expected, 1) + tm.assert_almost_equal(expected, 1) for t in self._objs: @@ -380,12 +382,12 @@ def test_imethods_with_dups(self): result = s.iloc[[2, 3]] expected = Series([2, 3], [2, 2], dtype='int64') - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) df = s.to_frame() result = df.iloc[2] expected = Series(2, index=[0], name=2) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) result = df.iat[2, 0] expected = 2 @@ -399,7 +401,7 @@ def test_repeated_getitem_dups(self): index=['ABCDE' [x % 5] for x in range(20)]) expected = df.loc['A', 0] result = df.loc[:, 0].loc['A'] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_iloc_exceeds_bounds(self): @@ -421,70 +423,69 @@ def test_iloc_exceeds_bounds(self): self.assertRaises(IndexError, lambda: s.iloc[[-100]]) # still raise on a single indexer - with tm.assertRaisesRegexp( - IndexError, 'single positional indexer is out-of-bounds'): + msg = 'single positional indexer is out-of-bounds' + with tm.assertRaisesRegexp(IndexError, msg): df.iloc[30] self.assertRaises(IndexError, lambda: df.iloc[-30]) # GH10779 # single positive/negative indexer exceeding Series bounds should raise # an IndexError - with tm.assertRaisesRegexp( - IndexError, 'single positional indexer is out-of-bounds'): + with tm.assertRaisesRegexp(IndexError, msg): s.iloc[30] self.assertRaises(IndexError, lambda: s.iloc[-30]) # slices are ok result = df.iloc[:, 4:10] # 0 < start < len < stop expected = df.iloc[:, 4:] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.iloc[:, -4:-10] # stop < 0 < start < len expected = df.iloc[:, :0] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.iloc[:, 10:4:-1] # 0 < stop < len < start (down) expected = df.iloc[:, :4:-1] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.iloc[:, 4:-10:-1] # stop < 0 < start < len (down) expected = df.iloc[:, 4::-1] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.iloc[:, -10:4] # start < 0 < stop < len expected = df.iloc[:, :4] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.iloc[:, 10:4] # 0 < stop < len < start expected = df.iloc[:, :0] - 
assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.iloc[:, -10:-11:-1] # stop < start < 0 < len (down) expected = df.iloc[:, :0] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.iloc[:, 10:11] # 0 < len < start < stop expected = df.iloc[:, :0] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # slice bounds exceeding is ok result = s.iloc[18:30] expected = s.iloc[18:] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) result = s.iloc[30:] expected = s.iloc[:0] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) result = s.iloc[30::-1] expected = s.iloc[::-1] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) # doc example def check(result, expected): str(result) result.dtypes - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) dfl = DataFrame(np.random.randn(5, 2), columns=list('AB')) check(dfl.iloc[:, 2:3], DataFrame(index=dfl.index)) @@ -497,9 +498,8 @@ def check(result, expected): def test_iloc_getitem_int(self): # integer - self.check_result('integer', 'iloc', 2, 'ix', {0: 4, - 1: 6, - 2: 8}, typs=['ints']) + self.check_result('integer', 'iloc', 2, 'ix', + {0: 4, 1: 6, 2: 8}, typs=['ints']) self.check_result('integer', 'iloc', 2, 'indexer', 2, typs=['labels', 'mixed', 'ts', 'floats', 'empty'], fails=IndexError) @@ -507,9 +507,8 @@ def test_iloc_getitem_int(self): def test_iloc_getitem_neg_int(self): # neg integer - self.check_result('neg int', 'iloc', -1, 'ix', {0: 6, - 1: 9, - 2: 12}, typs=['ints']) + self.check_result('neg int', 'iloc', -1, 'ix', + {0: 6, 1: 9, 2: 12}, typs=['ints']) self.check_result('neg int', 'iloc', -1, 'indexer', -1, typs=['labels', 'mixed', 'ts', 'floats', 'empty'], fails=IndexError) @@ -517,14 +516,11 @@ def test_iloc_getitem_neg_int(self): def test_iloc_getitem_list_int(self): # list of ints - self.check_result('list int', 'iloc', [0, 1, 2], 'ix', {0: [0, 2, 4], - 1: [0, 3, 6], - 2: [0, 4, 8]}, - typs=['ints']) - self.check_result('list int', 'iloc', [2], 'ix', {0: [4], - 1: [6], - 2: [8]}, + self.check_result('list int', 'iloc', [0, 1, 2], 'ix', + {0: [0, 2, 4], 1: [0, 3, 6], 2: [0, 4, 8]}, typs=['ints']) + self.check_result('list int', 'iloc', [2], 'ix', + {0: [4], 1: [6], 2: [8]}, typs=['ints']) self.check_result('list int', 'iloc', [0, 1, 2], 'indexer', [0, 1, 2], typs=['labels', 'mixed', 'ts', 'floats', 'empty'], fails=IndexError) @@ -535,10 +531,8 @@ def test_iloc_getitem_list_int(self): {0: [0, 2, 4], 1: [0, 3, 6], 2: [0, 4, 8]}, typs=['ints']) - self.check_result('array int', 'iloc', np.array([2]), 'ix', {0: [4], - 1: [6], - 2: [8]}, - typs=['ints']) + self.check_result('array int', 'iloc', np.array([2]), 'ix', + {0: [4], 1: [6], 2: [8]}, typs=['ints']) self.check_result('array int', 'iloc', np.array([0, 1, 2]), 'indexer', [0, 1, 2], typs=['labels', 'mixed', 'ts', 'floats', 'empty'], @@ -552,11 +546,11 @@ def test_iloc_getitem_neg_int_can_reach_first_index(self): expected = df.iloc[0] result = df.iloc[-3] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) expected = df.iloc[[0]] result = df.iloc[[-3]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) expected = s.iloc[0] result = s.iloc[-3] @@ -564,20 +558,19 @@ def test_iloc_getitem_neg_int_can_reach_first_index(self): expected = s.iloc[[0]] result = s.iloc[[-3]] - assert_series_equal(result, expected) + 
tm.assert_series_equal(result, expected) # check the length 1 Series case highlighted in GH10547 expected = pd.Series(['a'], index=['A']) result = expected.iloc[[-1]] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_iloc_getitem_dups(self): # no dups in panel (bug?) self.check_result('list int (dups)', 'iloc', [0, 1, 1, 3], 'ix', - {0: [0, 2, 2, 6], - 1: [0, 3, 3, 9 - ]}, objs=['series', 'frame'], typs=['ints']) + {0: [0, 2, 2, 6], 1: [0, 3, 3, 9]}, + objs=['series', 'frame'], typs=['ints']) # GH 6766 df1 = DataFrame([{'A': None, 'B': 1}, {'A': 2, 'B': 2}]) @@ -591,15 +584,14 @@ def test_iloc_getitem_dups(self): result = df.iloc[0, :] expected = Series([np.nan, 1, 3, 3], index=['A', 'B', 'A', 'B'], name=0) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_iloc_getitem_array(self): # array like s = Series(index=lrange(1, 4)) - self.check_result('array like', 'iloc', s.index, 'ix', {0: [2, 4, 6], - 1: [3, 6, 9], - 2: [4, 8, 12]}, + self.check_result('array like', 'iloc', s.index, 'ix', + {0: [2, 4, 6], 1: [3, 6, 9], 2: [4, 8, 12]}, typs=['ints']) def test_iloc_getitem_bool(self): @@ -614,39 +606,38 @@ def test_iloc_getitem_bool(self): def test_iloc_getitem_slice(self): # slices - self.check_result('slice', 'iloc', slice(1, 3), 'ix', {0: [2, 4], - 1: [3, 6], - 2: [4, 8]}, + self.check_result('slice', 'iloc', slice(1, 3), 'ix', + {0: [2, 4], 1: [3, 6], 2: [4, 8]}, typs=['ints']) - self.check_result('slice', 'iloc', slice(1, 3), 'indexer', slice( - 1, 3), typs=['labels', 'mixed', 'ts', 'floats', 'empty'], - fails=IndexError) + self.check_result('slice', 'iloc', slice(1, 3), 'indexer', + slice(1, 3), + typs=['labels', 'mixed', 'ts', 'floats', 'empty'], + fails=IndexError) def test_iloc_getitem_slice_dups(self): df1 = DataFrame(np.random.randn(10, 4), columns=['A', 'A', 'B', 'B']) - df2 = DataFrame( - np.random.randint(0, 10, size=20).reshape(10, - 2), columns=['A', 'C']) + df2 = DataFrame(np.random.randint(0, 10, size=20).reshape(10, 2), + columns=['A', 'C']) # axis=1 df = concat([df1, df2], axis=1) - assert_frame_equal(df.iloc[:, :4], df1) - assert_frame_equal(df.iloc[:, 4:], df2) + tm.assert_frame_equal(df.iloc[:, :4], df1) + tm.assert_frame_equal(df.iloc[:, 4:], df2) df = concat([df2, df1], axis=1) - assert_frame_equal(df.iloc[:, :2], df2) - assert_frame_equal(df.iloc[:, 2:], df1) + tm.assert_frame_equal(df.iloc[:, :2], df2) + tm.assert_frame_equal(df.iloc[:, 2:], df1) - assert_frame_equal(df.iloc[:, 0:3], concat( - [df2, df1.iloc[:, [0]]], axis=1)) + exp = concat([df2, df1.iloc[:, [0]]], axis=1) + tm.assert_frame_equal(df.iloc[:, 0:3], exp) # axis=0 df = concat([df, df], axis=0) - assert_frame_equal(df.iloc[0:10, :2], df2) - assert_frame_equal(df.iloc[0:10, 2:], df1) - assert_frame_equal(df.iloc[10:, :2], df2) - assert_frame_equal(df.iloc[10:, 2:], df1) + tm.assert_frame_equal(df.iloc[0:10, :2], df2) + tm.assert_frame_equal(df.iloc[0:10, 2:], df1) + tm.assert_frame_equal(df.iloc[10:, :2], df2) + tm.assert_frame_equal(df.iloc[10:, 2:], df1) def test_iloc_getitem_multiindex2(self): # TODO(wesm): fix this @@ -659,11 +650,11 @@ def test_iloc_getitem_multiindex2(self): rs = df.iloc[2] xp = Series(arr[2], index=df.columns) - assert_series_equal(rs, xp) + tm.assert_series_equal(rs, xp) rs = df.iloc[:, 2] xp = Series(arr[:, 2], index=df.index) - assert_series_equal(rs, xp) + tm.assert_series_equal(rs, xp) rs = df.iloc[2, 2] xp = df.values[2, 2] @@ -673,14 +664,14 @@ def test_iloc_getitem_multiindex2(self): # GH 5528 
rs = df.iloc[[0, 1]] xp = df.xs(4, drop_level=False) - assert_frame_equal(rs, xp) + tm.assert_frame_equal(rs, xp) tup = zip(*[['a', 'a', 'b', 'b'], ['x', 'y', 'x', 'y']]) index = MultiIndex.from_tuples(tup) df = DataFrame(np.random.randn(4, 4), index=index) rs = df.iloc[[2, 3]] xp = df.xs('b', drop_level=False) - assert_frame_equal(rs, xp) + tm.assert_frame_equal(rs, xp) def test_iloc_setitem(self): df = self.frame_ints @@ -692,13 +683,13 @@ def test_iloc_setitem(self): df.iloc[:, 2:3] = 0 expected = df.iloc[:, 2:3] result = df.iloc[:, 2:3] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # GH5771 s = Series(0, index=[4, 5, 6]) s.iloc[1:2] += 1 expected = Series([0, 1, 0], index=[4, 5, 6]) - assert_series_equal(s, expected) + tm.assert_series_equal(s, expected) def test_loc_setitem_slice(self): # GH10503 @@ -711,7 +702,7 @@ def test_loc_setitem_slice(self): df1.loc[ix, 'b'] = newb1 expected = DataFrame({'a': [0, 1, 1], 'b': Series([100, 201, 301], dtype='uint32')}) - assert_frame_equal(df1, expected) + tm.assert_frame_equal(df1, expected) # assigning a new type should get the inferred type df2 = DataFrame({'a': [0, 1, 1], 'b': [100, 200, 300]}, @@ -721,7 +712,7 @@ def test_loc_setitem_slice(self): df1.loc[ix, 'b'] = newb2 expected = DataFrame({'a': [0, 1, 1], 'b': [100, 200, 300]}, dtype='uint64') - assert_frame_equal(df2, expected) + tm.assert_frame_equal(df2, expected) def test_ix_loc_setitem_consistency(self): @@ -730,7 +721,7 @@ def test_ix_loc_setitem_consistency(self): s = Series(0, index=[4, 5, 6]) s.loc[4:5] += 1 expected = Series([1, 1, 0], index=[4, 5, 6]) - assert_series_equal(s, expected) + tm.assert_series_equal(s, expected) # GH 5928 # chained indexing assignment @@ -739,12 +730,12 @@ def test_ix_loc_setitem_consistency(self): expected.ix[[0, 1, 2], 'a'] = -expected.ix[[0, 1, 2], 'a'] df['a'].ix[[0, 1, 2]] = -df['a'].ix[[0, 1, 2]] - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) df = DataFrame({'a': [0, 1, 2], 'b': [0, 1, 2]}) df['a'].ix[[0, 1, 2]] = -df['a'].ix[[0, 1, 2]].astype('float64') + 0.5 expected = DataFrame({'a': [0.5, -0.5, -1.5], 'b': [0, 1, 2]}) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # GH 8607 # ix setitem consistency @@ -758,15 +749,15 @@ def test_ix_loc_setitem_consistency(self): df2 = df.copy() df2['timestamp'] = pd.to_datetime(df['timestamp'], unit='s') - assert_frame_equal(df2, expected) + tm.assert_frame_equal(df2, expected) df2 = df.copy() df2.loc[:, 'timestamp'] = pd.to_datetime(df['timestamp'], unit='s') - assert_frame_equal(df2, expected) + tm.assert_frame_equal(df2, expected) df2 = df.copy() df2.ix[:, 2] = pd.to_datetime(df['timestamp'], unit='s') - assert_frame_equal(df2, expected) + tm.assert_frame_equal(df2, expected) def test_ix_loc_consistency(self): @@ -775,7 +766,7 @@ def test_ix_loc_consistency(self): # this is not an exhaustive case def compare(result, expected): - if lib.isscalar(expected): + if is_scalar(expected): self.assertEqual(result, expected) else: self.assertTrue(expected.equals(result)) @@ -793,9 +784,8 @@ def compare(result, expected): self.assertRaises(TypeError, lambda: df.loc[key]) - df = pd.DataFrame( - np.random.randn(5, 4), columns=list('ABCD'), - index=pd.date_range('2012-01-01', periods=5)) + df = pd.DataFrame(np.random.randn(5, 4), columns=list('ABCD'), + index=pd.date_range('2012-01-01', periods=5)) for key in ['2012-01-03', '2012-01-31', @@ -831,19 +821,19 @@ def compare(result, expected): result1 = s['a':'c'] result2 = 
s.ix['a':'c'] result3 = s.loc['a':'c'] - assert_series_equal(result1, result2) - assert_series_equal(result1, result3) + tm.assert_series_equal(result1, result2) + tm.assert_series_equal(result1, result3) # now work rather than raising KeyError s = Series(range(5), [-2, -1, 1, 2, 3]) result1 = s.ix[-10:3] result2 = s.loc[-10:3] - assert_series_equal(result1, result2) + tm.assert_series_equal(result1, result2) result1 = s.ix[0:3] result2 = s.loc[0:3] - assert_series_equal(result1, result2) + tm.assert_series_equal(result1, result2) def test_setitem_multiindex(self): for index_fn in ('ix', 'loc'): @@ -856,31 +846,32 @@ def check(target, indexers, value, compare_fn, expected=None): expected = value compare_fn(result, expected) # GH7190 - index = pd.MultiIndex.from_product( - [np.arange(0, 100), np.arange(0, 80)], names=['time', 'firm']) + index = pd.MultiIndex.from_product([np.arange(0, 100), + np.arange(0, 80)], + names=['time', 'firm']) t, n = 0, 2 - df = DataFrame( - np.nan, columns=['A', 'w', 'l', 'a', 'x', 'X', 'd', 'profit'], - index=index) + df = DataFrame(np.nan, columns=['A', 'w', 'l', 'a', 'x', + 'X', 'd', 'profit'], + index=index) check(target=df, indexers=((t, n), 'X'), value=0, compare_fn=self.assertEqual) - df = DataFrame( - -999, columns=['A', 'w', 'l', 'a', 'x', 'X', 'd', 'profit'], - index=index) + df = DataFrame(-999, columns=['A', 'w', 'l', 'a', 'x', + 'X', 'd', 'profit'], + index=index) check(target=df, indexers=((t, n), 'X'), value=1, compare_fn=self.assertEqual) - df = DataFrame( - columns=['A', 'w', 'l', 'a', 'x', 'X', 'd', 'profit'], - index=index) + df = DataFrame(columns=['A', 'w', 'l', 'a', 'x', + 'X', 'd', 'profit'], + index=index) check(target=df, indexers=((t, n), 'X'), value=2, compare_fn=self.assertEqual) # GH 7218, assinging with 0-dim arrays - df = DataFrame( - -999, columns=['A', 'w', 'l', 'a', 'x', 'X', 'd', 'profit'], - index=index) + df = DataFrame(-999, columns=['A', 'w', 'l', 'a', 'x', + 'X', 'd', 'profit'], + index=index) check(target=df, indexers=((t, n), 'X'), value=np.array(3), @@ -888,54 +879,55 @@ def check(target, indexers, value, compare_fn, expected=None): expected=3, ) # GH5206 - df = pd.DataFrame( - np.arange(25).reshape(5, 5), columns='A,B,C,D,E'.split(','), - dtype=float - ) + df = pd.DataFrame(np.arange(25).reshape(5, 5), + columns='A,B,C,D,E'.split(','), dtype=float) df['F'] = 99 row_selection = df['A'] % 2 == 0 col_selection = ['B', 'C'] df.ix[row_selection, col_selection] = df['F'] output = pd.DataFrame(99., index=[0, 2, 4], columns=['B', 'C']) - assert_frame_equal(df.ix[row_selection, col_selection], output) + tm.assert_frame_equal(df.ix[row_selection, col_selection], output) check(target=df, indexers=(row_selection, col_selection), value=df['F'], - compare_fn=assert_frame_equal, + compare_fn=tm.assert_frame_equal, expected=output, ) # GH11372 idx = pd.MultiIndex.from_product([ - ['A', 'B', 'C'], pd.date_range( - '2015-01-01', '2015-04-01', freq='MS') - ]) + ['A', 'B', 'C'], + pd.date_range('2015-01-01', '2015-04-01', freq='MS')]) cols = pd.MultiIndex.from_product([ - ['foo', 'bar'], pd.date_range( - '2016-01-01', '2016-02-01', freq='MS') - ]) - df = pd.DataFrame( - np.random.random((12, 4)), index=idx, columns=cols) - subidx = pd.MultiIndex.from_tuples([('A', pd.Timestamp( - '2015-01-01')), ('A', pd.Timestamp('2015-02-01'))]) - subcols = pd.MultiIndex.from_tuples([('foo', pd.Timestamp( - '2016-01-01')), ('foo', pd.Timestamp('2016-02-01'))]) - vals = pd.DataFrame( - np.random.random((2, 2)), index=subidx, columns=subcols) + ['foo', 
'bar'], + pd.date_range('2016-01-01', '2016-02-01', freq='MS')]) + + df = pd.DataFrame(np.random.random((12, 4)), + index=idx, columns=cols) + + subidx = pd.MultiIndex.from_tuples( + [('A', pd.Timestamp('2015-01-01')), + ('A', pd.Timestamp('2015-02-01'))]) + subcols = pd.MultiIndex.from_tuples( + [('foo', pd.Timestamp('2016-01-01')), + ('foo', pd.Timestamp('2016-02-01'))]) + + vals = pd.DataFrame(np.random.random((2, 2)), + index=subidx, columns=subcols) check(target=df, indexers=(subidx, subcols), value=vals, - compare_fn=assert_frame_equal, ) + compare_fn=tm.assert_frame_equal, ) # set all columns vals = pd.DataFrame( np.random.random((2, 4)), index=subidx, columns=cols) check(target=df, indexers=(subidx, slice(None, None, None)), value=vals, - compare_fn=assert_frame_equal, ) + compare_fn=tm.assert_frame_equal, ) # identity copy = df.copy() check(target=df, indexers=(df.index, df.columns), value=df, - compare_fn=assert_frame_equal, expected=copy) + compare_fn=tm.assert_frame_equal, expected=copy) def test_indexing_with_datetime_tz(self): @@ -955,17 +947,17 @@ def test_indexing_with_datetime_tz(self): expected = Series([Timestamp('2013-01-02 00:00:00-0500', tz='US/Eastern'), np.nan, np.nan], index=list('ABC'), dtype='object', name=1) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) result = df.loc[1] expected = Series([Timestamp('2013-01-02 00:00:00-0500', tz='US/Eastern'), np.nan, np.nan], index=list('ABC'), dtype='object', name=1) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) # indexing - fast_xs df = DataFrame({'a': date_range('2014-01-01', periods=10, tz='UTC')}) result = df.iloc[5] - expected = Timestamp('2014-01-06 00:00:00+0000', tz='UTC', offset='D') + expected = Timestamp('2014-01-06 00:00:00+0000', tz='UTC', freq='D') self.assertEqual(result, expected) result = df.loc[5] @@ -974,7 +966,7 @@ def test_indexing_with_datetime_tz(self): # indexing - boolean result = df[df.a > df.a[3]] expected = df.iloc[4:] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # indexing - setting an element df = DataFrame(data=pd.to_datetime( @@ -992,7 +984,7 @@ def f(): v = df.loc[df.new_col == 'new', 'time'] + pd.Timedelta('1s') df.loc[df.new_col == 'new', 'time'] = v - assert_series_equal(df.loc[df.new_col == 'new', 'time'], v) + tm.assert_series_equal(df.loc[df.new_col == 'new', 'time'], v) def test_indexing_with_datetimeindex_tz(self): @@ -1007,22 +999,22 @@ def test_indexing_with_datetimeindex_tz(self): for sel in (index, list(index)): # getitem - assert_series_equal(ser[sel], ser) + tm.assert_series_equal(ser[sel], ser) # setitem result = ser.copy() result[sel] = 1 expected = pd.Series(1, index=index) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) # .loc getitem - assert_series_equal(ser.loc[sel], ser) + tm.assert_series_equal(ser.loc[sel], ser) # .loc setitem result = ser.copy() result.loc[sel] = 1 expected = pd.Series(1, index=index) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) # single element indexing @@ -1033,7 +1025,7 @@ def test_indexing_with_datetimeindex_tz(self): result = ser.copy() result[index[1]] = 5 expected = pd.Series([0, 5], index=index) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) # .loc getitem self.assertEqual(ser.loc[index[1]], 1) @@ -1042,7 +1034,7 @@ def test_indexing_with_datetimeindex_tz(self): result = ser.copy() result.loc[index[1]] = 5 expected = 
pd.Series([0, 5], index=index) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_loc_setitem_dups(self): @@ -1056,7 +1048,7 @@ def test_loc_setitem_dups(self): indexer = tuple(['r', ['bar', 'bar2']]) df = df_orig.copy() df.loc[indexer] *= 2.0 - assert_series_equal(df.loc[indexer], 2.0 * df_orig.loc[indexer]) + tm.assert_series_equal(df.loc[indexer], 2.0 * df_orig.loc[indexer]) indexer = tuple(['r', 'bar']) df = df_orig.copy() @@ -1066,7 +1058,7 @@ def test_loc_setitem_dups(self): indexer = tuple(['t', ['bar', 'bar2']]) df = df_orig.copy() df.loc[indexer] *= 2.0 - assert_frame_equal(df.loc[indexer], 2.0 * df_orig.loc[indexer]) + tm.assert_frame_equal(df.loc[indexer], 2.0 * df_orig.loc[indexer]) def test_iloc_setitem_dups(self): @@ -1081,24 +1073,24 @@ def test_iloc_setitem_dups(self): inds = np.isnan(df.iloc[:, 0]) mask = inds[inds].index df.iloc[mask, 0] = df.iloc[mask, 2] - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # del a dup column across blocks expected = DataFrame({0: [1, 2], 1: [3, 4]}) expected.columns = ['B', 'B'] del df['A'] - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # assign back to self df.iloc[[0, 1], [0, 1]] = df.iloc[[0, 1], [0, 1]] - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # reversed x 2 df.iloc[[1, 0], [0, 1]] = df.iloc[[1, 0], [0, 1]].reset_index( drop=True) df.iloc[[1, 0], [0, 1]] = df.iloc[[1, 0], [0, 1]].reset_index( drop=True) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) def test_chained_getitem_with_lists(self): @@ -1193,12 +1185,14 @@ def test_loc_getitem_label_list(self): self.check_result('list lbl', 'loc', [4, 8, 10], 'ix', [4, 8, 10], typs=['ints'], axes=2, fails=KeyError) + def test_loc_getitem_label_list_fails(self): # fails self.check_result('list lbl', 'loc', [20, 30, 40], 'ix', [20, 30, 40], typs=['ints'], axes=1, fails=KeyError) self.check_result('list lbl', 'loc', [20, 30, 40], 'ix', [20, 30, 40], typs=['ints'], axes=2, fails=KeyError) + def test_loc_getitem_label_array_like(self): # array like self.check_result('array like', 'loc', Series(index=[0, 2, 4]).index, 'ix', [0, 2, 4], typs=['ints'], axes=0) @@ -1208,7 +1202,6 @@ def test_loc_getitem_label_list(self): 'ix', [4, 8, 12], typs=['ints'], axes=2) def test_loc_getitem_bool(self): - # boolean indexers b = [True, False, True, False] self.check_result('bool', 'loc', b, 'ix', b, @@ -1235,7 +1228,7 @@ def test_loc_getitem_int_slice(self): df = DataFrame(np.random.randn(6, 6), index, index) result = df.loc[6:8, :] expected = df.ix[6:8, :] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) index = MultiIndex.from_tuples([t for t in product( @@ -1243,17 +1236,17 @@ def test_loc_getitem_int_slice(self): df = DataFrame(np.random.randn(6, 6), index, index) result = df.loc[20:30, :] expected = df.ix[20:30, :] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # doc examples result = df.loc[10, :] expected = df.ix[10, :] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.loc[:, 10] # expected = df.ix[:,10] (this fails) expected = df[10] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_loc_to_fail(self): @@ -1281,7 +1274,7 @@ def test_loc_to_fail(self): s.loc[-1] = 3 result = s.loc[[-1, -2]] expected = Series([3, np.nan], index=[-1, -2]) - assert_series_equal(result, expected) + tm.assert_series_equal(result, 
expected) s['a'] = 2 self.assertRaises(KeyError, lambda: s.loc[[-2]]) @@ -1307,6 +1300,7 @@ def f(): self.assertRaises(KeyError, f) + def test_at_to_fail(self): # at should not fallback # GH 7814 s = Series([1, 2, 3], index=list('abc')) @@ -1329,30 +1323,40 @@ def f(): self.assertEqual(result, 3) self.assertRaises(ValueError, lambda: df.at['a', 0]) + # GH 13822, incorrect error string with non-unique columns when missing + # column is accessed + df = DataFrame({'x': [1.], 'y': [2.], 'z': [3.]}) + df.columns = ['x', 'x', 'z'] + + # Check that we get the correct value in the KeyError + self.assertRaisesRegexp(KeyError, "\['y'\] not in index", + lambda: df[['x', 'y', 'z']]) + def test_loc_getitem_label_slice(self): # label slices (with ints) - self.check_result('lab slice', 'loc', slice(1, 3), 'ix', slice( - 1, 3), typs=['labels', 'mixed', 'empty', 'ts', 'floats'], - fails=TypeError) + self.check_result('lab slice', 'loc', slice(1, 3), + 'ix', slice(1, 3), + typs=['labels', 'mixed', 'empty', 'ts', 'floats'], + fails=TypeError) # real label slices - self.check_result('lab slice', 'loc', slice('a', 'c'), 'ix', slice( - 'a', 'c'), typs=['labels'], axes=0) - self.check_result('lab slice', 'loc', slice('A', 'C'), 'ix', slice( - 'A', 'C'), typs=['labels'], axes=1) - self.check_result('lab slice', 'loc', slice('W', 'Z'), 'ix', slice( - 'W', 'Z'), typs=['labels'], axes=2) - - self.check_result('ts slice', 'loc', slice( - '20130102', '20130104'), 'ix', slice('20130102', '20130104'), - typs=['ts'], axes=0) - self.check_result('ts slice', 'loc', slice( - '20130102', '20130104'), 'ix', slice('20130102', '20130104'), - typs=['ts'], axes=1, fails=TypeError) - self.check_result('ts slice', 'loc', slice( - '20130102', '20130104'), 'ix', slice('20130102', '20130104'), - typs=['ts'], axes=2, fails=TypeError) + self.check_result('lab slice', 'loc', slice('a', 'c'), + 'ix', slice('a', 'c'), typs=['labels'], axes=0) + self.check_result('lab slice', 'loc', slice('A', 'C'), + 'ix', slice('A', 'C'), typs=['labels'], axes=1) + self.check_result('lab slice', 'loc', slice('W', 'Z'), + 'ix', slice('W', 'Z'), typs=['labels'], axes=2) + + self.check_result('ts slice', 'loc', slice('20130102', '20130104'), + 'ix', slice('20130102', '20130104'), + typs=['ts'], axes=0) + self.check_result('ts slice', 'loc', slice('20130102', '20130104'), + 'ix', slice('20130102', '20130104'), + typs=['ts'], axes=1, fails=TypeError) + self.check_result('ts slice', 'loc', slice('20130102', '20130104'), + 'ix', slice('20130102', '20130104'), + typs=['ts'], axes=2, fails=TypeError) self.check_result('mixed slice', 'loc', slice(2, 8), 'ix', slice(2, 8), typs=['mixed'], axes=0, fails=TypeError) @@ -1378,7 +1382,7 @@ def test_loc_general(self): # mixed type result = DataFrame({'a': [Timestamp('20130101')], 'b': [1]}).iloc[0] expected = Series([Timestamp('20130101'), 1], index=['a', 'b'], name=0) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) self.assertEqual(result.dtype, object) def test_loc_setitem_consistency(self): @@ -1392,49 +1396,45 @@ def test_loc_setitem_consistency(self): 'val': Series( range(5), dtype=np.int64)}) df.loc[:, 'date'] = 0 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) df = DataFrame({'date': date_range('2000-01-01', '2000-01-5'), - 'val': Series( - range(5), dtype=np.int64)}) + 'val': Series(range(5), dtype=np.int64)}) df.loc[:, 'date'] = np.array(0, dtype=np.int64) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) df = DataFrame({'date': 
date_range('2000-01-01', '2000-01-5'), - 'val': Series( - range(5), dtype=np.int64)}) + 'val': Series(range(5), dtype=np.int64)}) df.loc[:, 'date'] = np.array([0, 0, 0, 0, 0], dtype=np.int64) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) expected = DataFrame({'date': Series('foo', index=range(5)), - 'val': Series( - range(5), dtype=np.int64)}) + 'val': Series(range(5), dtype=np.int64)}) df = DataFrame({'date': date_range('2000-01-01', '2000-01-5'), - 'val': Series( - range(5), dtype=np.int64)}) + 'val': Series(range(5), dtype=np.int64)}) df.loc[:, 'date'] = 'foo' - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) expected = DataFrame({'date': Series(1.0, index=range(5)), - 'val': Series( - range(5), dtype=np.int64)}) + 'val': Series(range(5), dtype=np.int64)}) df = DataFrame({'date': date_range('2000-01-01', '2000-01-5'), - 'val': Series( - range(5), dtype=np.int64)}) + 'val': Series(range(5), dtype=np.int64)}) df.loc[:, 'date'] = 1.0 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) + def test_loc_setitem_consistency_empty(self): # empty (essentially noops) expected = DataFrame(columns=['x', 'y']) expected['x'] = expected['x'].astype(np.int64) df = DataFrame(columns=['x', 'y']) df.loc[:, 'x'] = 1 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) df = DataFrame(columns=['x', 'y']) df['x'] = 1 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) + def test_loc_setitem_consistency_slice_column_len(self): # .loc[:,column] setting with slice == len of the column # GH10408 data = """Level_0,,,Respondent,Respondent,Respondent,OtherCat,OtherCat @@ -1457,7 +1457,7 @@ def test_loc_setitem_consistency(self): 'Respondent', 'Duration')].astype('timedelta64[s]') expected = Series([1380, 720, 840, 2160.], index=df.index, name=('Respondent', 'Duration')) - assert_series_equal(df[('Respondent', 'Duration')], expected) + tm.assert_series_equal(df[('Respondent', 'Duration')], expected) def test_loc_setitem_frame(self): df = self.frame_labels @@ -1474,7 +1474,7 @@ def test_loc_setitem_frame(self): df.loc[:, 'B':'D'] = 0 expected = df.loc[:, 'B':'D'] result = df.ix[:, 1:] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # GH 6254 # setting issue @@ -1482,7 +1482,7 @@ def test_loc_setitem_frame(self): df.loc[[4, 3, 5], 'A'] = np.array([1, 2, 3], dtype='int64') expected = DataFrame(dict(A=Series( [1, 2, 3], index=[4, 3, 5]))).reindex(index=[3, 5, 4]) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # GH 6252 # setting with an empty frame @@ -1502,14 +1502,14 @@ def test_loc_setitem_frame(self): expected = DataFrame(dict(A=Series(val1, index=keys1), B=Series( val2, index=keys2))).reindex(index=index) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # GH 8669 # invalid coercion of nan -> int df = DataFrame({'A': [1, 2, 3], 'B': np.nan}) df.loc[df.B > df.A, 'B'] = df.A expected = DataFrame({'A': [1, 2, 3], 'B': np.nan}) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # GH 6546 # setting with mixed labels @@ -1517,11 +1517,11 @@ def test_loc_setitem_frame(self): result = df.loc[0, [1, 2]] expected = Series([1, 3], index=[1, 2], dtype=object, name=0) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) expected = DataFrame({1: [5, 2], 2: [6, 4], 'a': ['a', 'b']}) df.loc[0, [1, 2]] = [5, 6] - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) def 
test_loc_setitem_frame_multiples(self): # multiple setting @@ -1534,7 +1534,7 @@ def test_loc_setitem_frame_multiples(self): expected = DataFrame({'A': ['bar', 'baz', 'baz'], 'B': Series( [1, 2, 2], dtype=np.int64)}) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # multiple setting with frame on rhs (with M8) df = DataFrame({'date': date_range('2000-01-01', '2000-01-5'), @@ -1548,16 +1548,15 @@ def test_loc_setitem_frame_multiples(self): rhs = df.loc[0:2] rhs.index = df.index[2:5] df.loc[2:4] = rhs - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) def test_iloc_getitem_frame(self): - df = DataFrame( - np.random.randn( - 10, 4), index=lrange(0, 20, 2), columns=lrange(0, 8, 2)) + df = DataFrame(np.random.randn(10, 4), index=lrange(0, 20, 2), + columns=lrange(0, 8, 2)) result = df.iloc[2] exp = df.ix[4] - assert_series_equal(result, exp) + tm.assert_series_equal(result, exp) result = df.iloc[2, 2] exp = df.ix[4, 4] @@ -1566,41 +1565,41 @@ def test_iloc_getitem_frame(self): # slice result = df.iloc[4:8] expected = df.ix[8:14] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.iloc[:, 2:3] expected = df.ix[:, 4:5] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # list of integers result = df.iloc[[0, 1, 3]] expected = df.ix[[0, 2, 6]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.iloc[[0, 1, 3], [0, 1]] expected = df.ix[[0, 2, 6], [0, 2]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # neg indicies result = df.iloc[[-1, 1, 3], [-1, 1]] expected = df.ix[[18, 2, 6], [6, 2]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # dups indicies result = df.iloc[[-1, -1, 1, 3], [-1, 1]] expected = df.ix[[18, 18, 2, 6], [6, 2]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # with index-like s = Series(index=lrange(1, 5)) result = df.iloc[s.index] expected = df.ix[[2, 4, 6, 8]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) + def test_iloc_getitem_labelled_frame(self): # try with labelled frame - df = DataFrame( - np.random.randn(10, - 4), index=list('abcdefghij'), columns=list('ABCD')) + df = DataFrame(np.random.randn(10, 4), + index=list('abcdefghij'), columns=list('ABCD')) result = df.iloc[1, 1] exp = df.ix['b', 'B'] @@ -1608,7 +1607,7 @@ def test_iloc_getitem_frame(self): result = df.iloc[:, 2:3] expected = df.ix[:, ['C']] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # negative indexing result = df.iloc[-1, -1] @@ -1631,11 +1630,11 @@ def test_iloc_getitem_panel(self): result = p.iloc[1] expected = p.loc['B'] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = p.iloc[1, 1] expected = p.loc['B', 'b'] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) result = p.iloc[1, 1, 1] expected = p.loc['B', 'b', 'two'] @@ -1644,31 +1643,31 @@ def test_iloc_getitem_panel(self): # slice result = p.iloc[1:3] expected = p.loc[['B', 'C']] - assert_panel_equal(result, expected) + tm.assert_panel_equal(result, expected) result = p.iloc[:, 0:2] expected = p.loc[:, ['a', 'b']] - assert_panel_equal(result, expected) + tm.assert_panel_equal(result, expected) # list of integers result = p.iloc[[0, 2]] expected = p.loc[['A', 'C']] - assert_panel_equal(result, expected) + tm.assert_panel_equal(result, expected) # neg 
indicies result = p.iloc[[-1, 1], [-1, 1]] expected = p.loc[['D', 'B'], ['c', 'b']] - assert_panel_equal(result, expected) + tm.assert_panel_equal(result, expected) # dups indicies result = p.iloc[[-1, -1, 1], [-1, 1]] expected = p.loc[['D', 'D', 'B'], ['c', 'b']] - assert_panel_equal(result, expected) + tm.assert_panel_equal(result, expected) # combined result = p.iloc[0, [True, True], [0, 1]] expected = p.loc['A', ['a', 'b'], ['one', 'two']] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # out-of-bounds exception self.assertRaises(IndexError, p.iloc.__getitem__, tuple([10, 5])) @@ -1688,13 +1687,13 @@ def f(): expected = p['A'] result = p.iloc[0, :, :] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = p.iloc[0, [True, True, True], :] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = p.iloc[0, [True, True, True], [0, 1]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def f(): p.iloc[0, [True, True, True], [0, 1, 2]] @@ -1706,6 +1705,7 @@ def f(): self.assertRaises(IndexError, f) + def test_iloc_getitem_panel_multiindex(self): # GH 7199 # Panel with multi-index multi_index = pd.MultiIndex.from_tuples([('ONE', 'one'), @@ -1722,33 +1722,33 @@ def f(): expected1 = wd1['First'].iloc[[True, True, True, False], [0, 2]] result1 = wd1.iloc[0, [True, True, True, False], [0, 2]] # WRONG - assert_frame_equal(result1, expected1) + tm.assert_frame_equal(result1, expected1) expected2 = wd2['First'].iloc[[True, True, True, False], [0, 2]] result2 = wd2.iloc[0, [True, True, True, False], [0, 2]] - assert_frame_equal(result2, expected2) + tm.assert_frame_equal(result2, expected2) expected1 = DataFrame(index=['a'], columns=multi_index, dtype='float64') result1 = wd1.iloc[0, [0], [0, 1, 2]] - assert_frame_equal(result1, expected1) + tm.assert_frame_equal(result1, expected1) expected2 = DataFrame(index=['a'], columns=simple_index, dtype='float64') result2 = wd2.iloc[0, [0], [0, 1, 2]] - assert_frame_equal(result2, expected2) + tm.assert_frame_equal(result2, expected2) # GH 7516 mi = MultiIndex.from_tuples([(0, 'x'), (1, 'y'), (2, 'z')]) - p = Panel( - np.arange(3 * 3 * 3, dtype='int64').reshape(3, 3, 3), - items=['a', 'b', 'c'], major_axis=mi, minor_axis=['u', 'v', 'w']) + p = Panel(np.arange(3 * 3 * 3, dtype='int64').reshape(3, 3, 3), + items=['a', 'b', 'c'], major_axis=mi, + minor_axis=['u', 'v', 'w']) result = p.iloc[:, 1, 0] expected = Series([3, 12, 21], index=['a', 'b', 'c'], name='u') - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) result = p.loc[:, (1, 'y'), 'u'] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_iloc_getitem_doc_issue(self): @@ -1769,7 +1769,7 @@ def test_iloc_getitem_doc_issue(self): expected = DataFrame(arr[3:5, 0:2], index=index[3:5], columns=columns[0:2]) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # for dups df.columns = list('aaaa') @@ -1779,7 +1779,7 @@ def test_iloc_getitem_doc_issue(self): expected = DataFrame(arr[3:5, 0:2], index=index[3:5], columns=list('aa')) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # related arr = np.random.randn(6, 4) @@ -1793,7 +1793,7 @@ def test_iloc_getitem_doc_issue(self): result.dtypes expected = DataFrame(arr[1:5, 2:4], index=index[1:5], columns=columns[2:4]) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def 
test_setitem_ndarray_1d(self): # GH5508 @@ -1815,7 +1815,7 @@ def f(): result = df.ix[2:5, 'bar'] expected = Series([2.33j, 1.23 + 0.1j, 2.2, 1.0], index=[2, 3, 4, 5], name='bar') - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) # dtype getting changed? df = DataFrame(index=Index(lrange(1, 11))) @@ -1828,9 +1828,8 @@ def f(): self.assertRaises(ValueError, f) def test_iloc_setitem_series(self): - df = DataFrame( - np.random.randn(10, - 4), index=list('abcdefghij'), columns=list('ABCD')) + df = DataFrame(np.random.randn(10, 4), index=list('abcdefghij'), + columns=list('ABCD')) df.iloc[1, 1] = 1 result = df.iloc[1, 1] @@ -1839,7 +1838,7 @@ def test_iloc_setitem_series(self): df.iloc[:, 2:3] = 0 expected = df.iloc[:, 2:3] result = df.iloc[:, 2:3] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) s = Series(np.random.randn(10), index=lrange(0, 20, 2)) @@ -1850,35 +1849,35 @@ def test_iloc_setitem_series(self): s.iloc[:4] = 0 expected = s.iloc[:4] result = s.iloc[:4] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) s = Series([-1] * 6) s.iloc[0::2] = [0, 2, 4] s.iloc[1::2] = [1, 3, 5] result = s expected = Series([0, 1, 2, 3, 4, 5]) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_iloc_setitem_list_of_lists(self): # GH 7551 # list-of-list is set incorrectly in mixed vs. single dtyped frames - df = DataFrame(dict(A=np.arange(5, dtype='int64'), B=np.arange( - 5, 10, dtype='int64'))) + df = DataFrame(dict(A=np.arange(5, dtype='int64'), + B=np.arange(5, 10, dtype='int64'))) df.iloc[2:4] = [[10, 11], [12, 13]] expected = DataFrame(dict(A=[0, 1, 10, 12, 4], B=[5, 6, 11, 13, 9])) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) df = DataFrame( dict(A=list('abcde'), B=np.arange(5, 10, dtype='int64'))) df.iloc[2:4] = [['x', 11], ['y', 13]] - expected = DataFrame(dict(A=['a', 'b', 'x', 'y', 'e'], B=[5, 6, 11, 13, - 9])) - assert_frame_equal(df, expected) + expected = DataFrame(dict(A=['a', 'b', 'x', 'y', 'e'], + B=[5, 6, 11, 13, 9])) + tm.assert_frame_equal(df, expected) def test_iloc_getitem_multiindex(self): - mi_labels = DataFrame(np.random.randn(4, 3), columns=[['i', 'i', 'j'], - ['A', 'A', 'B']], + mi_labels = DataFrame(np.random.randn(4, 3), + columns=[['i', 'i', 'j'], ['A', 'A', 'B']], index=[['i', 'i', 'j', 'k'], ['X', 'X', 'Y', 'Y']]) @@ -1889,14 +1888,14 @@ def test_iloc_getitem_multiindex(self): # the first row rs = mi_int.iloc[0] xp = mi_int.ix[4].ix[8] - assert_series_equal(rs, xp, check_names=False) + tm.assert_series_equal(rs, xp, check_names=False) self.assertEqual(rs.name, (4, 8)) self.assertEqual(xp.name, 8) # 2nd (last) columns rs = mi_int.iloc[:, 2] xp = mi_int.ix[:, 2] - assert_series_equal(rs, xp) + tm.assert_series_equal(rs, xp) # corner column rs = mi_int.iloc[2, 2] @@ -1910,8 +1909,8 @@ def test_iloc_getitem_multiindex(self): def test_loc_multiindex(self): - mi_labels = DataFrame(np.random.randn(3, 3), columns=[['i', 'i', 'j'], - ['A', 'A', 'B']], + mi_labels = DataFrame(np.random.randn(3, 3), + columns=[['i', 'i', 'j'], ['A', 'A', 'B']], index=[['i', 'i', 'j'], ['X', 'X', 'Y']]) mi_int = DataFrame(np.random.randn(3, 3), @@ -1921,26 +1920,28 @@ def test_loc_multiindex(self): # the first row rs = mi_labels.loc['i'] xp = mi_labels.ix['i'] - assert_frame_equal(rs, xp) + tm.assert_frame_equal(rs, xp) # 2nd (last) columns rs = mi_labels.loc[:, 'j'] xp = mi_labels.ix[:, 'j'] - assert_frame_equal(rs, xp) + 
tm.assert_frame_equal(rs, xp) # corner column rs = mi_labels.loc['j'].loc[:, 'j'] xp = mi_labels.ix['j'].ix[:, 'j'] - assert_frame_equal(rs, xp) + tm.assert_frame_equal(rs, xp) # with a tuple rs = mi_labels.loc[('i', 'X')] xp = mi_labels.ix[('i', 'X')] - assert_frame_equal(rs, xp) + tm.assert_frame_equal(rs, xp) rs = mi_int.loc[4] xp = mi_int.ix[4] - assert_frame_equal(rs, xp) + tm.assert_frame_equal(rs, xp) + + def test_loc_multiindex_indexer_none(self): # GH6788 # multi-index indexer is None (meaning take all) @@ -1951,45 +1952,45 @@ def test_loc_multiindex(self): df = 0.1 * np.random.randn(10, 1 * 5) + 0.5 df = DataFrame(df, columns=index) result = df[attributes] - assert_frame_equal(result, df) + tm.assert_frame_equal(result, df) # GH 7349 # loc with a multi-index seems to be doing fallback - df = DataFrame( - np.arange(12).reshape(-1, 1), - index=pd.MultiIndex.from_product([[1, 2, 3, 4], [1, 2, 3]])) + df = DataFrame(np.arange(12).reshape(-1, 1), + index=pd.MultiIndex.from_product([[1, 2, 3, 4], + [1, 2, 3]])) expected = df.loc[([1, 2], ), :] result = df.loc[[1, 2]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) + + def test_loc_multiindex_incomplete(self): # GH 7399 # incomplete indexers - s = pd.Series( - np.arange(15, dtype='int64'), - MultiIndex.from_product([range(5), ['a', 'b', 'c']])) + s = pd.Series(np.arange(15, dtype='int64'), + MultiIndex.from_product([range(5), ['a', 'b', 'c']])) expected = s.loc[:, 'a':'c'] result = s.loc[0:4, 'a':'c'] - assert_series_equal(result, expected) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) result = s.loc[:4, 'a':'c'] - assert_series_equal(result, expected) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) result = s.loc[0:, 'a':'c'] - assert_series_equal(result, expected) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) # GH 7400 # multiindexer gettitem with list of indexers skips wrong element - s = pd.Series( - np.arange(15, dtype='int64'), - MultiIndex.from_product([range(5), ['a', 'b', 'c']])) + s = pd.Series(np.arange(15, dtype='int64'), + MultiIndex.from_product([range(5), ['a', 'b', 'c']])) expected = s.iloc[[6, 7, 8, 12, 13, 14]] result = s.loc[2:4:2, 'a':'c'] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_multiindex_perf_warn(self): @@ -2008,76 +2009,6 @@ def test_multiindex_perf_warn(self): with tm.assert_produces_warning(PerformanceWarning): df.loc[(0, )] - @slow - def test_multiindex_get_loc(self): # GH7724, GH2646 - - with warnings.catch_warnings(record=True): - - # test indexing into a multi-index before & past the lexsort depth - from numpy.random import randint, choice, randn - cols = ['jim', 'joe', 'jolie', 'joline', 'jolia'] - - def validate(mi, df, key): - mask = np.ones(len(df)).astype('bool') - - # test for all partials of this key - for i, k in enumerate(key): - mask &= df.iloc[:, i] == k - - if not mask.any(): - self.assertNotIn(key[:i + 1], mi.index) - continue - - self.assertIn(key[:i + 1], mi.index) - right = df[mask].copy() - - if i + 1 != len(key): # partial key - right.drop(cols[:i + 1], axis=1, inplace=True) - right.set_index(cols[i + 1:-1], inplace=True) - assert_frame_equal(mi.loc[key[:i + 1]], right) - - else: # full key - right.set_index(cols[:-1], inplace=True) - if len(right) == 1: # single hit - 
right = Series(right['jolia'].values, - name=right.index[0], - index=['jolia']) - assert_series_equal(mi.loc[key[:i + 1]], right) - else: # multi hit - assert_frame_equal(mi.loc[key[:i + 1]], right) - - def loop(mi, df, keys): - for key in keys: - validate(mi, df, key) - - n, m = 1000, 50 - - vals = [randint(0, 10, n), choice( - list('abcdefghij'), n), choice( - pd.date_range('20141009', periods=10).tolist(), n), choice( - list('ZYXWVUTSRQ'), n), randn(n)] - vals = list(map(tuple, zip(*vals))) - - # bunch of keys for testing - keys = [randint(0, 11, m), choice( - list('abcdefghijk'), m), choice( - pd.date_range('20141009', periods=11).tolist(), m), choice( - list('ZYXWVUTSRQP'), m)] - keys = list(map(tuple, zip(*keys))) - keys += list(map(lambda t: t[:-1], vals[::n // m])) - - # covers both unique index and non-unique index - df = pd.DataFrame(vals, columns=cols) - a, b = pd.concat([df, df]), df.drop_duplicates(subset=cols[:-1]) - - for frame in a, b: - for i in range(5): # lexsort depth - df = frame.copy() if i == 0 else frame.sort_values( - by=cols[:i]) - mi = df.set_index(cols[:-1]) - assert not mi.index.lexsort_depth < i - loop(mi, df, keys) - def test_series_getitem_multiindex(self): # GH 6018 @@ -2088,27 +2019,29 @@ def test_series_getitem_multiindex(self): result = s[:, 0] expected = Series([1], index=[0]) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) result = s.ix[:, 1] expected = Series([2, 3], index=[1, 2]) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) # xs result = s.xs(0, level=0) expected = Series([1], index=[0]) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) result = s.xs(1, level=1) expected = Series([2, 3], index=[1, 2]) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) # GH6258 - s = Series([1, 3, 4, 1, 3, 4], index=MultiIndex.from_product([list( - 'AB'), list(date_range('20130903', periods=3))])) + dt = list(date_range('20130903', periods=3)) + idx = MultiIndex.from_product([list('AB'), dt]) + s = Series([1, 3, 4, 1, 3, 4], index=idx) + result = s.xs('20130903', level=1) expected = Series([1, 1], index=list('AB')) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) # GH5684 idx = MultiIndex.from_tuples([('a', 'one'), ('a', 'two'), ('b', 'one'), @@ -2118,28 +2051,16 @@ def test_series_getitem_multiindex(self): result = s.xs('one', level='L2') expected = Series([1, 3], index=['a', 'b']) expected.index.set_names(['L1'], inplace=True) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_ix_general(self): # ix general issues # GH 2817 - data = {'amount': {0: 700, - 1: 600, - 2: 222, - 3: 333, - 4: 444}, - 'col': {0: 3.5, - 1: 3.5, - 2: 4.0, - 3: 4.0, - 4: 4.0}, - 'year': {0: 2012, - 1: 2011, - 2: 2012, - 3: 2012, - 4: 2012}} + data = {'amount': {0: 700, 1: 600, 2: 222, 3: 333, 4: 444}, + 'col': {0: 3.5, 1: 3.5, 2: 4.0, 3: 4.0, 4: 4.0}, + 'year': {0: 2012, 1: 2011, 2: 2012, 3: 2012, 4: 2012}} df = DataFrame(data).set_index(keys=['col', 'year']) key = 4.0, 2012 @@ -2172,7 +2093,7 @@ def test_ix_weird_slicing(self): 2: -3, 3: 4, 4: 5}}) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) def test_xs_multiindex(self): @@ -2184,12 +2105,12 @@ def test_xs_multiindex(self): df.sortlevel(axis=1, inplace=True) result = df.xs('a', level='lvl0', axis=1) expected = df.iloc[:, 0:2].loc[:, 'a'] - assert_frame_equal(result, expected) + 
tm.assert_frame_equal(result, expected) result = df.xs('foo', level='lvl1', axis=1) expected = df.iloc[:, 1:2].copy() expected.columns = expected.columns.droplevel('lvl1') - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_per_axis_per_level_getitem(self): @@ -2204,14 +2125,14 @@ def test_per_axis_per_level_getitem(self): for a, b, c, d in df.index.values if (a == 'A1' or a == 'A2' or a == 'A3') and ( c == 'C1' or c == 'C3')]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) expected = df.loc[[tuple([a, b, c, d]) for a, b, c, d in df.index.values if (a == 'A1' or a == 'A2' or a == 'A3') and ( c == 'C1' or c == 'C2' or c == 'C3')]] result = df.loc[(slice('A1', 'A3'), slice(None), slice('C1', 'C3')), :] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # test multi-index slicing with per axis and per index controls index = MultiIndex.from_tuples([('A', 1), ('A', 2), @@ -2228,40 +2149,40 @@ def test_per_axis_per_level_getitem(self): # identity result = df.loc[(slice(None), slice(None)), :] - assert_frame_equal(result, df) + tm.assert_frame_equal(result, df) result = df.loc[(slice(None), slice(None)), (slice(None), slice(None))] - assert_frame_equal(result, df) + tm.assert_frame_equal(result, df) result = df.loc[:, (slice(None), slice(None))] - assert_frame_equal(result, df) + tm.assert_frame_equal(result, df) # index result = df.loc[(slice(None), [1]), :] expected = df.iloc[[0, 3]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.loc[(slice(None), 1), :] expected = df.iloc[[0, 3]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # columns result = df.loc[:, (slice(None), ['foo'])] expected = df.iloc[:, [1, 3]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # both result = df.loc[(slice(None), 1), (slice(None), ['foo'])] expected = df.iloc[[0, 3], [1, 3]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.loc['A', 'a'] expected = DataFrame(dict(bar=[1, 5, 9], foo=[0, 4, 8]), index=Index([1, 2, 3], name='two'), columns=Index(['bar', 'foo'], name='lvl1')) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.loc[(slice(None), [1, 2]), :] expected = df.iloc[[0, 1, 3]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # multi-level series s = Series(np.arange(len(ix.get_values())), index=ix) @@ -2270,12 +2191,12 @@ def test_per_axis_per_level_getitem(self): for a, b, c, d in s.index.values if (a == 'A1' or a == 'A2' or a == 'A3') and ( c == 'C1' or c == 'C3')]] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) # boolean indexers result = df.loc[(slice(None), df.loc[:, ('a', 'bar')] > 5), :] expected = df.iloc[[2, 3]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def f(): df.loc[(slice(None), np.array([True, False])), :] @@ -2289,7 +2210,7 @@ def f(): result = df.loc[(slice(None), [1]), :] expected = df.iloc[[0, 3]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # not lexsorted self.assertEqual(df.index.lexsort_depth, 2) @@ -2315,11 +2236,11 @@ def test_multiindex_slicers_non_unique(self): C=[1, 1], D=[1, 3])) .set_index(['A', 'B', 'C']).sortlevel()) result = df.loc[(slice(None), slice(None), 1), :] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # 
this is equivalent of an xs expression result = df.xs(1, level=2, drop_level=False) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) df = (DataFrame(dict(A=['foo', 'foo', 'foo', 'foo'], B=['a', 'a', 'a', 'a'], @@ -2332,7 +2253,7 @@ def test_multiindex_slicers_non_unique(self): .set_index(['A', 'B', 'C']).sortlevel()) result = df.loc[(slice(None), slice(None), 1), :] self.assertFalse(result.index.is_unique) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # GH12896 # numpy-implementation dependent bug @@ -2344,7 +2265,7 @@ def test_multiindex_slicers_non_unique(self): result = result.sort_index() result = result.loc[(slice(None), slice(100000))] expected = Series([1] * (n - 2), index=idx[:-2]).sort_index() - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_multiindex_slicers_datetimelike(self): @@ -2364,28 +2285,28 @@ def test_multiindex_slicers_datetimelike(self): # multi-axis slicing idx = pd.IndexSlice expected = df.iloc[[0, 2, 4], [0, 1]] - result = df.loc[(slice( - Timestamp('2012-01-01 12:12:12'), Timestamp( - '2012-01-03 12:12:12')), slice(1, 1)), slice('A', 'B')] - assert_frame_equal(result, expected) + result = df.loc[(slice(Timestamp('2012-01-01 12:12:12'), + Timestamp('2012-01-03 12:12:12')), + slice(1, 1)), slice('A', 'B')] + tm.assert_frame_equal(result, expected) result = df.loc[(idx[Timestamp('2012-01-01 12:12:12'):Timestamp( '2012-01-03 12:12:12')], idx[1:1]), slice('A', 'B')] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) - result = df.loc[(slice( - Timestamp('2012-01-01 12:12:12'), Timestamp( - '2012-01-03 12:12:12')), 1), slice('A', 'B')] - assert_frame_equal(result, expected) + result = df.loc[(slice(Timestamp('2012-01-01 12:12:12'), + Timestamp('2012-01-03 12:12:12')), 1), + slice('A', 'B')] + tm.assert_frame_equal(result, expected) # with strings result = df.loc[(slice('2012-01-01 12:12:12', '2012-01-03 12:12:12'), slice(1, 1)), slice('A', 'B')] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.loc[(idx['2012-01-01 12:12:12':'2012-01-03 12:12:12'], 1), idx['A', 'B']] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_multiindex_slicers_edges(self): # GH 8132 @@ -2406,47 +2327,47 @@ def test_multiindex_slicers_edges(self): # A1 - Get all values under "A0" and "A1" result = df1.loc[(slice('A1')), :] expected = df1.iloc[0:10] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # A2 - Get all values from the start to "A2" result = df1.loc[(slice('A2')), :] expected = df1 - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # A3 - Get all values under "B1" or "B2" result = df1.loc[(slice(None), slice('B1', 'B2')), :] expected = df1.iloc[[2, 3, 4, 7, 8, 9, 12, 13, 14]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # A4 - Get all values between 2013-07-02 and 2013-07-09 - result = df1.loc[(slice(None), slice(None), slice('20130702', - '20130709')), :] + result = df1.loc[(slice(None), slice(None), + slice('20130702', '20130709')), :] expected = df1.iloc[[1, 2, 6, 7, 12]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # B1 - Get all values in B0 that are also under A0, A1 and A2 result = df1.loc[(slice('A2'), slice('B0')), :] expected = df1.iloc[[0, 1, 5, 6, 10, 11]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, 
expected) # B2 - Get all values in B0, B1 and B2 (similar to what #2 is doing for # the As) result = df1.loc[(slice(None), slice('B2')), :] expected = df1 - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # B3 - Get all values from B1 to B2 and up to 2013-08-06 - result = df1.loc[(slice(None), slice('B1', 'B2'), slice('2013-08-06') - ), :] + result = df1.loc[(slice(None), slice('B1', 'B2'), + slice('2013-08-06')), :] expected = df1.iloc[[2, 3, 4, 7, 8, 9, 12, 13]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # B4 - Same as A4 but the start of the date slice is not a key. # shows indexing on a partial selection slice - result = df1.loc[(slice(None), slice(None), slice('20130701', - '20130709')), :] + result = df1.loc[(slice(None), slice(None), + slice('20130701', '20130709')), :] expected = df1.iloc[[1, 2, 6, 7, 12]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_per_axis_per_level_doc_examples(self): @@ -2461,24 +2382,23 @@ def test_per_axis_per_level_doc_examples(self): names=['lvl0', 'lvl1']) df = DataFrame(np.arange(len(index) * len(columns), dtype='int64') .reshape((len(index), len(columns))), - index=index, - columns=columns) + index=index, columns=columns) result = df.loc[(slice('A1', 'A3'), slice(None), ['C1', 'C3']), :] expected = df.loc[[tuple([a, b, c, d]) for a, b, c, d in df.index.values if (a == 'A1' or a == 'A2' or a == 'A3') and ( c == 'C1' or c == 'C3')]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.loc[idx['A1':'A3', :, ['C1', 'C3']], :] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.loc[(slice(None), slice(None), ['C1', 'C3']), :] expected = df.loc[[tuple([a, b, c, d]) for a, b, c, d in df.index.values if (c == 'C1' or c == 'C3')]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.loc[idx[:, :, ['C1', 'C3']], :] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # not sorted def f(): @@ -2512,22 +2432,22 @@ def test_loc_axis_arguments(self): for a, b, c, d in df.index.values if (a == 'A1' or a == 'A2' or a == 'A3') and ( c == 'C1' or c == 'C3')]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.loc(axis='index')[:, :, ['C1', 'C3']] expected = df.loc[[tuple([a, b, c, d]) for a, b, c, d in df.index.values if (c == 'C1' or c == 'C3')]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # axis 1 result = df.loc(axis=1)[:, 'foo'] expected = df.loc[:, (slice(None), 'foo')] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.loc(axis='columns')[:, 'foo'] expected = df.loc[:, (slice(None), 'foo')] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # invalid axis def f(): @@ -2553,10 +2473,10 @@ def test_loc_coerceion(self): expected = df.dtypes result = df.iloc[[0]] - assert_series_equal(result.dtypes, expected) + tm.assert_series_equal(result.dtypes, expected) result = df.iloc[[1]] - assert_series_equal(result.dtypes, expected) + tm.assert_series_equal(result.dtypes, expected) # 12045 import datetime @@ -2565,20 +2485,20 @@ def test_loc_coerceion(self): expected = df.dtypes result = df.iloc[[0]] - assert_series_equal(result.dtypes, expected) + tm.assert_series_equal(result.dtypes, expected) result = df.iloc[[1]] - assert_series_equal(result.dtypes, expected) + 
tm.assert_series_equal(result.dtypes, expected) # 11594 df = DataFrame({'text': ['some words'] + [None] * 9}) expected = df.dtypes result = df.iloc[0:2] - assert_series_equal(result.dtypes, expected) + tm.assert_series_equal(result.dtypes, expected) result = df.iloc[3:] - assert_series_equal(result.dtypes, expected) + tm.assert_series_equal(result.dtypes, expected) def test_per_axis_per_level_setitem(self): @@ -2603,70 +2523,70 @@ def test_per_axis_per_level_setitem(self): df.loc[(slice(None), slice(None)), :] = 100 expected = df_orig.copy() expected.iloc[:, :] = 100 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) df = df_orig.copy() df.loc(axis=0)[:, :] = 100 expected = df_orig.copy() expected.iloc[:, :] = 100 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) df = df_orig.copy() df.loc[(slice(None), slice(None)), (slice(None), slice(None))] = 100 expected = df_orig.copy() expected.iloc[:, :] = 100 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) df = df_orig.copy() df.loc[:, (slice(None), slice(None))] = 100 expected = df_orig.copy() expected.iloc[:, :] = 100 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # index df = df_orig.copy() df.loc[(slice(None), [1]), :] = 100 expected = df_orig.copy() expected.iloc[[0, 3]] = 100 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) df = df_orig.copy() df.loc[(slice(None), 1), :] = 100 expected = df_orig.copy() expected.iloc[[0, 3]] = 100 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) df = df_orig.copy() df.loc(axis=0)[:, 1] = 100 expected = df_orig.copy() expected.iloc[[0, 3]] = 100 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # columns df = df_orig.copy() df.loc[:, (slice(None), ['foo'])] = 100 expected = df_orig.copy() expected.iloc[:, [1, 3]] = 100 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # both df = df_orig.copy() df.loc[(slice(None), 1), (slice(None), ['foo'])] = 100 expected = df_orig.copy() expected.iloc[[0, 3], [1, 3]] = 100 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) df = df_orig.copy() df.loc[idx[:, 1], idx[:, ['foo']]] = 100 expected = df_orig.copy() expected.iloc[[0, 3], [1, 3]] = 100 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) df = df_orig.copy() df.loc['A', 'a'] = 100 expected = df_orig.copy() expected.iloc[0:3, 0:2] = 100 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # setting with a list-like df = df_orig.copy() @@ -2674,7 +2594,7 @@ def test_per_axis_per_level_setitem(self): [[100, 100], [100, 100]], dtype='int64') expected = df_orig.copy() expected.iloc[[0, 3], [1, 3]] = 100 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # not enough values df = df_orig.copy() @@ -2697,14 +2617,14 @@ def f(): None), 1), (slice(None), ['foo'])] * 5 expected = df_orig.copy() expected.iloc[[0, 3], [1, 3]] = expected.iloc[[0, 3], [1, 3]] * 5 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) df = df_orig.copy() df.loc[(slice(None), 1), (slice(None), ['foo'])] *= df.loc[(slice( None), 1), (slice(None), ['foo'])] expected = df_orig.copy() expected.iloc[[0, 3], [1, 3]] *= expected.iloc[[0, 3], [1, 3]] - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) rhs = df_orig.loc[(slice(None), 1), (slice(None), ['foo'])].copy() rhs.loc[:, ('c', 'bah')] = 10 @@ -2712,7 +2632,7 @@ def f(): df.loc[(slice(None), 1), 
(slice(None), ['foo'])] *= rhs expected = df_orig.copy() expected.iloc[[0, 3], [1, 3]] *= expected.iloc[[0, 3], [1, 3]] - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) def test_multiindex_setitem(self): @@ -2729,7 +2649,7 @@ def test_multiindex_setitem(self): expected = df_orig.loc[['bar']] * 2 df = df_orig.copy() df.loc[['bar']] *= 2 - assert_frame_equal(df.loc[['bar']], expected) + tm.assert_frame_equal(df.loc[['bar']], expected) # raise because these have differing levels def f(): @@ -2756,17 +2676,17 @@ def f(): idx = pd.IndexSlice df = df_orig.copy() df.loc[idx[:, :, 'Stock'], :] *= 2 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) df = df_orig.copy() df.loc[idx[:, :, 'Stock'], 'price'] *= 2 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) def test_getitem_multiindex(self): # GH 5725 the 'A' happens to be a valid Timestamp so the doesn't raise # the appropriate error, only in PY3 of course! - index = MultiIndex(levels=[['D', 'B', 'C'], [0, 26, 27, 37, 57, 67, 75, - 82]], + index = MultiIndex(levels=[['D', 'B', 'C'], + [0, 26, 27, 37, 57, 67, 75, 82]], labels=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]], names=['tag', 'day']) @@ -2775,7 +2695,7 @@ def test_getitem_multiindex(self): result = df.val['D'] expected = Series(arr.ravel()[0:3], name='val', index=Index( [26, 37, 57], name='day')) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def f(): df.val['A'] @@ -2788,8 +2708,8 @@ def f(): self.assertRaises(KeyError, f) # A is treated as a special Timestamp - index = MultiIndex(levels=[['A', 'B', 'C'], [0, 26, 27, 37, 57, 67, 75, - 82]], + index = MultiIndex(levels=[['A', 'B', 'C'], + [0, 26, 27, 37, 57, 67, 75, 82]], labels=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]], names=['tag', 'day']) @@ -2797,7 +2717,7 @@ def f(): result = df.val['A'] expected = Series(arr.ravel()[0:3], name='val', index=Index( [26, 37, 57], name='day')) - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def f(): df.val['X'] @@ -2806,20 +2726,20 @@ def f(): # GH 7866 # multi-index slicing with missing indexers - s = pd.Series(np.arange(9, dtype='int64'), - index=pd.MultiIndex.from_product( - [['A', 'B', 'C'], ['foo', 'bar', 'baz']], - names=['one', 'two'])).sortlevel() + idx = pd.MultiIndex.from_product([['A', 'B', 'C'], + ['foo', 'bar', 'baz']], + names=['one', 'two']) + s = pd.Series(np.arange(9, dtype='int64'), index=idx).sortlevel() + exp_idx = pd.MultiIndex.from_product([['A'], ['foo', 'bar', 'baz']], + names=['one', 'two']) expected = pd.Series(np.arange(3, dtype='int64'), - index=pd.MultiIndex.from_product( - [['A'], ['foo', 'bar', 'baz']], - names=['one', 'two'])).sortlevel() + index=exp_idx).sortlevel() result = s.loc[['A']] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) result = s.loc[['A', 'D']] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) # not any values found self.assertRaises(KeyError, lambda: s.loc[['D']]) @@ -2827,16 +2747,16 @@ def f(): # empty ok result = s.loc[[]] expected = s.iloc[[]] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) idx = pd.IndexSlice expected = pd.Series([0, 3, 6], index=pd.MultiIndex.from_product( [['A', 'B', 'C'], ['foo']], names=['one', 'two'])).sortlevel() result = s.loc[idx[:, ['foo']]] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) result = 
s.loc[idx[:, ['foo', 'bah']]] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) # GH 8737 # empty indexer @@ -2850,8 +2770,8 @@ def f(): columns=multi_index.reindex([])[0]) result1 = df.loc[:, ([], slice(None))] result2 = df.loc[:, (['foo'], [])] - assert_frame_equal(result1, expected) - assert_frame_equal(result2, expected) + tm.assert_frame_equal(result1, expected) + tm.assert_frame_equal(result2, expected) # regression from < 0.14.0 # GH 7914 @@ -2870,11 +2790,9 @@ def test_setitem_dtype_upcast(self): self.assertEqual(df['c'].dtype, np.float64) df.ix[0, 'c'] = 'foo' - expected = DataFrame([{"a": 1, - "c": 'foo'}, {"a": 3, - "b": 2, - "c": np.nan}]) - assert_frame_equal(df, expected) + expected = DataFrame([{"a": 1, "c": 'foo'}, + {"a": 3, "b": 2, "c": np.nan}]) + tm.assert_frame_equal(df, expected) # GH10280 df = DataFrame(np.arange(6, dtype='int64').reshape(2, 3), @@ -2887,9 +2805,9 @@ def test_setitem_dtype_upcast(self): right = DataFrame([[0, val, 2], [3, 4, 5]], index=list('ab'), columns=['foo', 'bar', 'baz']) - assert_frame_equal(left, right) - self.assertTrue(com.is_integer_dtype(left['foo'])) - self.assertTrue(com.is_integer_dtype(left['baz'])) + tm.assert_frame_equal(left, right) + self.assertTrue(is_integer_dtype(left['foo'])) + self.assertTrue(is_integer_dtype(left['baz'])) left = DataFrame(np.arange(6, dtype='int64').reshape(2, 3) / 10.0, index=list('ab'), @@ -2899,23 +2817,22 @@ def test_setitem_dtype_upcast(self): right = DataFrame([[0, 'wxyz', .2], [.3, .4, .5]], index=list('ab'), columns=['foo', 'bar', 'baz']) - assert_frame_equal(left, right) - self.assertTrue(com.is_float_dtype(left['foo'])) - self.assertTrue(com.is_float_dtype(left['baz'])) + tm.assert_frame_equal(left, right) + self.assertTrue(is_float_dtype(left['foo'])) + self.assertTrue(is_float_dtype(left['baz'])) def test_setitem_iloc(self): # setitem with an iloc list - df = DataFrame( - np.arange(9).reshape((3, 3)), index=["A", "B", "C"], - columns=["A", "B", "C"]) + df = DataFrame(np.arange(9).reshape((3, 3)), index=["A", "B", "C"], + columns=["A", "B", "C"]) df.iloc[[0, 1], [1, 2]] df.iloc[[0, 1], [1, 2]] += 100 expected = DataFrame( np.array([0, 101, 102, 3, 104, 105, 6, 7, 8]).reshape((3, 3)), index=["A", "B", "C"], columns=["A", "B", "C"]) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) def test_dups_fancy_indexing(self): @@ -2939,7 +2856,7 @@ def test_dups_fancy_indexing(self): df_v = df.iloc[:, 4] # noqa res_v = result.iloc[:, 4] # noqa - assert_frame_equal(df, result) + tm.assert_frame_equal(df, result) # GH 3561, dups not in selected order df = DataFrame( @@ -2952,10 +2869,10 @@ def test_dups_fancy_indexing(self): 'test1': [7., 6], 'other': ['d', 'c']}, index=rows) result = df.ix[rows] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df.ix[Index(rows)] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) rows = ['C', 'B', 'E'] expected = DataFrame( @@ -2964,7 +2881,7 @@ def test_dups_fancy_indexing(self): 'other': ['d', 'c', np.nan]}, index=rows) result = df.ix[rows] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # see GH5553, make sure we use the right indexer rows = ['F', 'G', 'H', 'C', 'B', 'E'] @@ -2974,7 +2891,7 @@ def test_dups_fancy_indexing(self): 'd', 'c', np.nan]}, index=rows) result = df.ix[rows] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # inconsistent returns for unique/duplicate indices when 
values are # missing @@ -2983,7 +2900,7 @@ def test_dups_fancy_indexing(self): dfnu = DataFrame(randn(5, 3), index=list('AABCD')) result = dfnu.ix[['E']] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # ToDo: check_index_type can be True after GH 11497 @@ -2991,19 +2908,19 @@ def test_dups_fancy_indexing(self): df = DataFrame({"A": [0, 1, 2]}) result = df.ix[[0, 8, 0]] expected = DataFrame({"A": [0, np.nan, 0]}, index=[0, 8, 0]) - assert_frame_equal(result, expected, check_index_type=False) + tm.assert_frame_equal(result, expected, check_index_type=False) df = DataFrame({"A": list('abc')}) result = df.ix[[0, 8, 0]] expected = DataFrame({"A": ['a', np.nan, 'a']}, index=[0, 8, 0]) - assert_frame_equal(result, expected, check_index_type=False) + tm.assert_frame_equal(result, expected, check_index_type=False) # non unique with non unique selector df = DataFrame({'test': [5, 7, 9, 11]}, index=['A', 'A', 'B', 'C']) expected = DataFrame( {'test': [5, 7, 5, 7, np.nan]}, index=['A', 'A', 'A', 'A', 'E']) result = df.ix[['A', 'A', 'E']] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # GH 5835 # dups on index and missing values @@ -3014,34 +2931,29 @@ def test_dups_fancy_indexing(self): [df.ix[:, ['A', 'B']], DataFrame(np.nan, columns=['C'], index=df.index)], axis=1) result = df.ix[:, ['A', 'B', 'C']] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # GH 6504, multi-axis indexing - df = DataFrame( - np.random.randn( - 9, 2), index=[1, 1, 1, 2, 2, 2, 3, 3, 3], columns=['a', 'b']) + df = DataFrame(np.random.randn(9, 2), + index=[1, 1, 1, 2, 2, 2, 3, 3, 3], columns=['a', 'b']) expected = df.iloc[0:6] result = df.loc[[1, 2]] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) expected = df result = df.loc[:, ['a', 'b']] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) expected = df.iloc[0:6, :] result = df.loc[[1, 2], ['a', 'b']] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_indexing_mixed_frame_bug(self): # GH3492 - df = DataFrame({'a': {1: 'aaa', - 2: 'bbb', - 3: 'ccc'}, - 'b': {1: 111, - 2: 222, - 3: 333}}) + df = DataFrame({'a': {1: 'aaa', 2: 'bbb', 3: 'ccc'}, + 'b': {1: 111, 2: 222, 3: 333}}) # this works, new column is created correctly df['test'] = df['a'].apply(lambda x: '_' if x == 'aaa' else x) @@ -3126,7 +3038,7 @@ def test_set_index_nan(self): result = df.set_index(['year', 'PRuid', 'QC']).reset_index().reindex( columns=df.columns) - assert_frame_equal(result, df) + tm.assert_frame_equal(result, df) def test_multi_nan_indexing(self): @@ -3141,7 +3053,7 @@ def test_multi_nan_indexing(self): index=[Index(['R1', 'R2', np.nan, 'R4'], name='a'), Index(['C1', 'C2', 'C3', 'C4'], name='b')]) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_iloc_panel_issue(self): @@ -3218,13 +3130,12 @@ def test_panel_setitem(self): index = range(3) columns = list('abc') - panel = Panel( - {'A': DataFrame( - np.random.randn(3, 3), index=index, columns=columns), - 'B': DataFrame( - np.random.randn(3, 3), index=index, columns=columns), - 'C': DataFrame( - np.random.randn(3, 3), index=index, columns=columns)}) + panel = Panel({'A': DataFrame(np.random.randn(3, 3), + index=index, columns=columns), + 'B': DataFrame(np.random.randn(3, 3), + index=index, columns=columns), + 'C': DataFrame(np.random.randn(3, 3), + index=index, columns=columns)}) replace = DataFrame(np.eye(3, 3), 
index=range(3), columns=columns) expected = Panel({'A': replace, 'B': replace, 'C': replace}) @@ -3243,8 +3154,8 @@ def test_panel_setitem_with_multiindex(self): # 10360 # failing with a multi-index - arr = np.array( - [[[1, 2, 3], [0, 0, 0]], [[0, 0, 0], [0, 0, 0]]], dtype=np.float64) + arr = np.array([[[1, 2, 3], [0, 0, 0]], [[0, 0, 0], [0, 0, 0]]], + dtype=np.float64) # reg index axes = dict(items=['A', 'B'], major_axis=[0, 1], @@ -3282,14 +3193,12 @@ def test_panel_setitem_with_multiindex(self): def test_panel_assignment(self): # GH3777 - wp = Panel( - randn(2, 5, 4), items=['Item1', 'Item2'], - major_axis=date_range('1/1/2000', periods=5), - minor_axis=['A', 'B', 'C', 'D']) - wp2 = Panel( - randn(2, 5, 4), items=['Item1', 'Item2'], - major_axis=date_range('1/1/2000', periods=5), - minor_axis=['A', 'B', 'C', 'D']) + wp = Panel(randn(2, 5, 4), items=['Item1', 'Item2'], + major_axis=date_range('1/1/2000', periods=5), + minor_axis=['A', 'B', 'C', 'D']) + wp2 = Panel(randn(2, 5, 4), items=['Item1', 'Item2'], + major_axis=date_range('1/1/2000', periods=5), + minor_axis=['A', 'B', 'C', 'D']) # TODO: unused? # expected = wp.loc[['Item1', 'Item2'], :, ['A', 'B']] @@ -3317,8 +3226,8 @@ def test_multiindex_assignment(self): arr = np.array([0., 1.]) df.ix[4, 'd'] = arr - assert_series_equal(df.ix[4, 'd'], Series(arr, index=[8, 10], - name='d')) + tm.assert_series_equal(df.ix[4, 'd'], + Series(arr, index=[8, 10], name='d')) # single dtype df = DataFrame(np.random.randint(5, 10, size=9).reshape(3, 3), @@ -3326,13 +3235,13 @@ def test_multiindex_assignment(self): index=[[4, 4, 8], [8, 10, 12]]) df.ix[4, 'c'] = arr - assert_series_equal(df.ix[4, 'c'], Series(arr, index=[8, 10], name='c', - dtype='float64')) + exp = Series(arr, index=[8, 10], name='c', dtype='float64') + tm.assert_series_equal(df.ix[4, 'c'], exp) # scalar ok df.ix[4, 'c'] = 10 - assert_series_equal(df.ix[4, 'c'], Series(10, index=[8, 10], name='c', - dtype='float64')) + exp = Series(10, index=[8, 10], name='c', dtype='float64') + tm.assert_series_equal(df.ix[4, 'c'], exp) # invalid assignments def f(): @@ -3348,13 +3257,12 @@ def f(): # groupby example NUM_ROWS = 100 NUM_COLS = 10 - col_names = ['A' + num - for num in map(str, np.arange(NUM_COLS).tolist())] + col_names = ['A' + num for num in + map(str, np.arange(NUM_COLS).tolist())] index_cols = col_names[:5] - df = DataFrame( - np.random.randint(5, size=(NUM_ROWS, NUM_COLS)), dtype=np.int64, - columns=col_names) + df = DataFrame(np.random.randint(5, size=(NUM_ROWS, NUM_COLS)), + dtype=np.int64, columns=col_names) df = df.set_index(index_cols).sort_index() grp = df.groupby(level=index_cols[:4]) df['new_col'] = np.nan @@ -3362,9 +3270,8 @@ def f(): f_index = np.arange(5) def f(name, df2): - return Series( - np.arange(df2.shape[0]), - name=df2.index.values[0]).reindex(f_index) + return Series(np.arange(df2.shape[0]), + name=df2.index.values[0]).reindex(f_index) # TODO(wesm): unused? 
# new_df = pd.concat([f(name, df2) for name, df2 in grp], axis=1).T @@ -3398,17 +3305,17 @@ def test_multi_assign(self): # frame on rhs df2.ix[mask, cols] = dft.ix[mask, cols] - assert_frame_equal(df2, expected) + tm.assert_frame_equal(df2, expected) df2.ix[mask, cols] = dft.ix[mask, cols] - assert_frame_equal(df2, expected) + tm.assert_frame_equal(df2, expected) # with an ndarray on rhs df2 = df.copy() df2.ix[mask, cols] = dft.ix[mask, cols].values - assert_frame_equal(df2, expected) + tm.assert_frame_equal(df2, expected) df2.ix[mask, cols] = dft.ix[mask, cols].values - assert_frame_equal(df2, expected) + tm.assert_frame_equal(df2, expected) # broadcasting on the rhs is required df = DataFrame(dict(A=[1, 2, 0, 0, 0], B=[0, 0, 0, 10, 11], C=[ @@ -3420,7 +3327,7 @@ def test_multi_assign(self): expected.loc[mask, col] = df['D'] df.loc[df['A'] == 0, ['A', 'B']] = df['D'] - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) def test_ix_assign_column_mixed(self): # GH #1142 @@ -3429,7 +3336,7 @@ def test_ix_assign_column_mixed(self): orig = df.ix[:, 'B'].copy() df.ix[:, 'B'] = df.ix[:, 'B'] + 1 - assert_series_equal(df.B, orig + 1) + tm.assert_series_equal(df.B, orig + 1) # GH 3668, mixed frame with series value df = DataFrame({'x': lrange(10), 'y': lrange(10, 20), 'z': 'bar'}) @@ -3442,20 +3349,20 @@ def test_ix_assign_column_mixed(self): self.assertEqual(expected.ix[indexer, 'y'], v) df.ix[df.x % 2 == 0, 'y'] = df.ix[df.x % 2 == 0, 'y'] * 100 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # GH 4508, making sure consistency of assignments df = DataFrame({'a': [1, 2, 3], 'b': [0, 1, 2]}) df.ix[[0, 2, ], 'b'] = [100, -100] expected = DataFrame({'a': [1, 2, 3], 'b': [100, 1, -100]}) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) df = pd.DataFrame({'a': lrange(4)}) df['b'] = np.nan df.ix[[1, 3], 'b'] = [100, -100] expected = DataFrame({'a': [0, 1, 2, 3], 'b': [np.nan, 100, np.nan, -100]}) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # ok, but chained assignments are dangerous # if we turn off chained assignement it will work @@ -3463,7 +3370,7 @@ def test_ix_assign_column_mixed(self): df = pd.DataFrame({'a': lrange(4)}) df['b'] = np.nan df['b'].ix[[1, 3]] = [100, -100] - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) def test_ix_get_set_consistency(self): @@ -3496,7 +3403,7 @@ def test_setitem_list(self): result = DataFrame(index=[0, 1], columns=[0]) result.ix[1, 0] = [1, 2] - assert_frame_equal(result, df) + tm.assert_frame_equal(result, df) # ix with an object class TO(object): @@ -3522,7 +3429,7 @@ def view(self): result = DataFrame(index=[0, 1], columns=[0]) result.ix[1, 0] = TO(2) - assert_frame_equal(result, df) + tm.assert_frame_equal(result, df) # remains object dtype even after setting it back df = DataFrame(index=[0, 1], columns=[0]) @@ -3530,7 +3437,7 @@ def view(self): df.ix[1, 0] = np.nan result = DataFrame(index=[0, 1], columns=[0]) - assert_frame_equal(result, df) + tm.assert_frame_equal(result, df) def test_iloc_mask(self): @@ -3544,7 +3451,7 @@ def test_iloc_mask(self): # ndarray ok result = df.iloc[np.array([True] * len(mask), dtype=bool)] - assert_frame_equal(result, df) + tm.assert_frame_equal(result, df) # the possibilities locs = np.arange(4) @@ -3617,7 +3524,7 @@ def test_ix_slicing_strings(self): 3: 'correct', 4: 'aaa'}}) # bug was 4: 'bbb' - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) def test_non_unique_loc(self): # GH3659 
@@ -3625,9 +3532,8 @@ def test_non_unique_loc(self): # https://groups.google.com/forum/?fromgroups#!topic/pydata/zTm2No0crYs # these are going to raise becuase the we are non monotonic - df = DataFrame( - {'A': [1, 2, 3, 4, 5, 6], - 'B': [3, 4, 5, 6, 7, 8]}, index=[0, 1, 0, 1, 2, 3]) + df = DataFrame({'A': [1, 2, 3, 4, 5, 6], + 'B': [3, 4, 5, 6, 7, 8]}, index=[0, 1, 0, 1, 2, 3]) self.assertRaises(KeyError, df.loc.__getitem__, tuple([slice(1, None)])) self.assertRaises(KeyError, df.loc.__getitem__, @@ -3635,22 +3541,21 @@ def test_non_unique_loc(self): self.assertRaises(KeyError, df.loc.__getitem__, tuple([slice(1, 2)])) # monotonic are ok - df = DataFrame( - {'A': [1, 2, 3, 4, 5, 6], - 'B': [3, 4, 5, 6, 7, 8]}, index=[0, 1, 0, 1, 2, 3]).sort_index( - axis=0) + df = DataFrame({'A': [1, 2, 3, 4, 5, 6], + 'B': [3, 4, 5, 6, 7, 8]}, + index=[0, 1, 0, 1, 2, 3]).sort_index(axis=0) result = df.loc[1:] - expected = DataFrame( - {'A': [2, 4, 5, 6], - 'B': [4, 6, 7, 8]}, index=[1, 1, 2, 3]) - assert_frame_equal(result, expected) + expected = DataFrame({'A': [2, 4, 5, 6], 'B': [4, 6, 7, 8]}, + index=[1, 1, 2, 3]) + tm.assert_frame_equal(result, expected) result = df.loc[0:] - assert_frame_equal(result, df) + tm.assert_frame_equal(result, df) result = df.loc[1:2] - expected = DataFrame({'A': [2, 4, 5], 'B': [4, 6, 7]}, index=[1, 1, 2]) - assert_frame_equal(result, expected) + expected = DataFrame({'A': [2, 4, 5], 'B': [4, 6, 7]}, + index=[1, 1, 2]) + tm.assert_frame_equal(result, expected) def test_loc_name(self): # GH 3880 @@ -3675,7 +3580,7 @@ def test_iloc_non_unique_indexing(self): df3 = pd.concat([df, 2 * df, 3 * df]) result = df3.iloc[idx] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) df2 = DataFrame({'A': [0.1] * 1000, 'B': [1] * 1000}) df2 = pd.concat([df2, 2 * df2, 3 * df2]) @@ -3693,7 +3598,7 @@ def test_iloc_non_unique_indexing(self): expected = pd.concat([expected, DataFrame(index=idx[idx > sidx.max()]) ]) result = df2.loc[idx] - assert_frame_equal(result, expected, check_index_type=False) + tm.assert_frame_equal(result, expected, check_index_type=False) def test_mi_access(self): @@ -3714,10 +3619,10 @@ def test_mi_access(self): expected = DataFrame([['a', 1, 1]], index=columns, columns=index).T result = df2.loc[:, ('A', 'A1')] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) result = df2[('A', 'A1')] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # GH 4146, not returning a block manager when selecting a unique index # from a duplicate index @@ -3725,15 +3630,14 @@ def test_mi_access(self): # with a non-unique) expected = Series(['a', 1, 1], index=['h1', 'h3', 'h5'], name='A1') result = df2['A']['A1'] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) # selecting a non_unique from the 2nd level expected = DataFrame([['d', 4, 4], ['e', 5, 5]], - index=Index( - ['B2', 'B2'], name='sub'), + index=Index(['B2', 'B2'], name='sub'), columns=['h1', 'h3', 'h5'], ).T result = df2['A']['B2'] - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_non_unique_loc_memory_error(self): @@ -3743,11 +3647,10 @@ def test_non_unique_loc_memory_error(self): columns = list('ABCDEFG') def gen_test(l, l2): - return pd.concat([DataFrame( - randn(l, len(columns)), index=lrange( - l), columns=columns), DataFrame( - np.ones((l2, len(columns) - )), index=[0] * l2, columns=columns)]) + return pd.concat([DataFrame(randn(l, len(columns)), + index=lrange(l), 
columns=columns), + DataFrame(np.ones((l2, len(columns))), + index=[0] * l2, columns=columns)]) def gen_expected(df, mask): l = len(mask) @@ -3763,7 +3666,7 @@ def gen_expected(df, mask): mask = np.arange(100) result = df.loc[mask] expected = gen_expected(df, mask) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) df = gen_test(900000, 100000) self.assertFalse(df.index.is_unique) @@ -3771,61 +3674,57 @@ def gen_expected(df, mask): mask = np.arange(100000) result = df.loc[mask] expected = gen_expected(df, mask) - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_astype_assignment(self): # GH4312 (iloc) - df_orig = DataFrame( - [['1', '2', '3', '.4', 5, 6., 'foo']], columns=list('ABCDEFG')) + df_orig = DataFrame([['1', '2', '3', '.4', 5, 6., 'foo']], + columns=list('ABCDEFG')) df = df_orig.copy() df.iloc[:, 0:2] = df.iloc[:, 0:2].astype(np.int64) - expected = DataFrame( - [[1, 2, '3', '.4', 5, 6., 'foo']], columns=list('ABCDEFG')) - assert_frame_equal(df, expected) + expected = DataFrame([[1, 2, '3', '.4', 5, 6., 'foo']], + columns=list('ABCDEFG')) + tm.assert_frame_equal(df, expected) df = df_orig.copy() df.iloc[:, 0:2] = df.iloc[:, 0:2]._convert(datetime=True, numeric=True) - expected = DataFrame( - [[1, 2, '3', '.4', 5, 6., 'foo']], columns=list('ABCDEFG')) - assert_frame_equal(df, expected) + expected = DataFrame([[1, 2, '3', '.4', 5, 6., 'foo']], + columns=list('ABCDEFG')) + tm.assert_frame_equal(df, expected) # GH5702 (loc) df = df_orig.copy() df.loc[:, 'A'] = df.loc[:, 'A'].astype(np.int64) - expected = DataFrame( - [[1, '2', '3', '.4', 5, 6., 'foo']], columns=list('ABCDEFG')) - assert_frame_equal(df, expected) + expected = DataFrame([[1, '2', '3', '.4', 5, 6., 'foo']], + columns=list('ABCDEFG')) + tm.assert_frame_equal(df, expected) df = df_orig.copy() df.loc[:, ['B', 'C']] = df.loc[:, ['B', 'C']].astype(np.int64) - expected = DataFrame( - [['1', 2, 3, '.4', 5, 6., 'foo']], columns=list('ABCDEFG')) - assert_frame_equal(df, expected) + expected = DataFrame([['1', 2, 3, '.4', 5, 6., 'foo']], + columns=list('ABCDEFG')) + tm.assert_frame_equal(df, expected) # full replacements / no nans df = DataFrame({'A': [1., 2., 3., 4.]}) df.iloc[:, 0] = df['A'].astype(np.int64) expected = DataFrame({'A': [1, 2, 3, 4]}) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) df = DataFrame({'A': [1., 2., 3., 4.]}) df.loc[:, 'A'] = df['A'].astype(np.int64) expected = DataFrame({'A': [1, 2, 3, 4]}) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) def test_astype_assignment_with_dups(self): # GH 4686 # assignment with dups that has a dtype change - df = DataFrame( - np.arange(3).reshape((1, 3)), - columns=pd.MultiIndex.from_tuples( - [('A', '1'), ('B', '1'), ('A', '2')] - ), - dtype=object - ) + cols = pd.MultiIndex.from_tuples([('A', '1'), ('B', '1'), ('A', '2')]) + df = DataFrame(np.arange(3).reshape((1, 3)), + columns=cols, dtype=object) index = df.index.copy() df['A'] = df['A'].astype(np.float64) @@ -3845,10 +3744,10 @@ def test_dups_loc(self): index=['a', 'a', 'a', 'a', 'a'], name=1) result = df.iloc[0] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) result = df.loc[1] - assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) def test_partial_setting(self): @@ -3860,22 +3759,22 @@ def test_partial_setting(self): s = s_orig.copy() s[5] = 5 expected = Series([1, 2, 3, 5], index=[0, 1, 2, 5]) - assert_series_equal(s, expected) + 
tm.assert_series_equal(s, expected) s = s_orig.copy() s.loc[5] = 5 expected = Series([1, 2, 3, 5], index=[0, 1, 2, 5]) - assert_series_equal(s, expected) + tm.assert_series_equal(s, expected) s = s_orig.copy() s[5] = 5. expected = Series([1, 2, 3, 5.], index=[0, 1, 2, 5]) - assert_series_equal(s, expected) + tm.assert_series_equal(s, expected) s = s_orig.copy() s.loc[5] = 5. expected = Series([1, 2, 3, 5.], index=[0, 1, 2, 5]) - assert_series_equal(s, expected) + tm.assert_series_equal(s, expected) # iloc/iat raise s = s_orig.copy() @@ -3912,96 +3811,97 @@ def f(): expected = DataFrame(dict({'A': [0, 4, 4], 'B': [1, 5, 5]})) df = df_orig.copy() df.iloc[1] = df.iloc[2] - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) expected = DataFrame(dict({'A': [0, 4, 4], 'B': [1, 5, 5]})) df = df_orig.copy() df.loc[1] = df.loc[2] - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # like 2578, partial setting with dtype preservation expected = DataFrame(dict({'A': [0, 2, 4, 4], 'B': [1, 3, 5, 5]})) df = df_orig.copy() df.loc[3] = df.loc[2] - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # single dtype frame, overwrite expected = DataFrame(dict({'A': [0, 2, 4], 'B': [0, 2, 4]})) df = df_orig.copy() df.ix[:, 'B'] = df.ix[:, 'A'] - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # mixed dtype frame, overwrite expected = DataFrame(dict({'A': [0, 2, 4], 'B': Series([0, 2, 4])})) df = df_orig.copy() df['B'] = df['B'].astype(np.float64) df.ix[:, 'B'] = df.ix[:, 'A'] - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # single dtype frame, partial setting expected = df_orig.copy() expected['C'] = df['A'] df = df_orig.copy() df.ix[:, 'C'] = df.ix[:, 'A'] - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # mixed frame, partial setting expected = df_orig.copy() expected['C'] = df['A'] df = df_orig.copy() df.ix[:, 'C'] = df.ix[:, 'A'] - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # ## panel ## - p_orig = Panel( - np.arange(16).reshape(2, 4, 2), items=['Item1', 'Item2'], - major_axis=pd.date_range('2001/1/12', periods=4), - minor_axis=['A', 'B'], dtype='float64') + p_orig = Panel(np.arange(16).reshape(2, 4, 2), + items=['Item1', 'Item2'], + major_axis=pd.date_range('2001/1/12', periods=4), + minor_axis=['A', 'B'], dtype='float64') # panel setting via item - p_orig = Panel( - np.arange(16).reshape(2, 4, 2), items=['Item1', 'Item2'], - major_axis=pd.date_range('2001/1/12', periods=4), - minor_axis=['A', 'B'], dtype='float64') + p_orig = Panel(np.arange(16).reshape(2, 4, 2), + items=['Item1', 'Item2'], + major_axis=pd.date_range('2001/1/12', periods=4), + minor_axis=['A', 'B'], dtype='float64') expected = p_orig.copy() expected['Item3'] = expected['Item1'] p = p_orig.copy() p.loc['Item3'] = p['Item1'] - assert_panel_equal(p, expected) + tm.assert_panel_equal(p, expected) # panel with aligned series expected = p_orig.copy() expected = expected.transpose(2, 1, 0) - expected['C'] = DataFrame( - {'Item1': [30, 30, 30, 30], - 'Item2': [32, 32, 32, 32]}, index=p_orig.major_axis) + expected['C'] = DataFrame({'Item1': [30, 30, 30, 30], + 'Item2': [32, 32, 32, 32]}, + index=p_orig.major_axis) expected = expected.transpose(2, 1, 0) p = p_orig.copy() p.loc[:, :, 'C'] = Series([30, 32], index=p_orig.items) - assert_panel_equal(p, expected) + tm.assert_panel_equal(p, expected) # GH 8473 dates = date_range('1/1/2000', periods=8) - df_orig = DataFrame( - 
np.random.randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D']) + df_orig = DataFrame(np.random.randn(8, 4), index=dates, + columns=['A', 'B', 'C', 'D']) expected = pd.concat([df_orig, DataFrame( {'A': 7}, index=[dates[-1] + 1])]) df = df_orig.copy() df.loc[dates[-1] + 1, 'A'] = 7 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) df = df_orig.copy() df.at[dates[-1] + 1, 'A'] = 7 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) + + exp_other = DataFrame({0: 7}, index=[dates[-1] + 1]) + expected = pd.concat([df_orig, exp_other], axis=1) - expected = pd.concat( - [df_orig, DataFrame({0: 7}, index=[dates[-1] + 1])], axis=1) df = df_orig.copy() df.loc[dates[-1] + 1, 0] = 7 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) df = df_orig.copy() df.at[dates[-1] + 1, 0] = 7 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) def test_partial_setting_mixed_dtype(self): @@ -4014,18 +3914,20 @@ def test_partial_setting_mixed_dtype(self): expected = df.append(s) df.loc[2] = df.loc[1] - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # columns will align df = DataFrame(columns=['A', 'B']) df.loc[0] = Series(1, index=range(4)) - assert_frame_equal(df, DataFrame(columns=['A', 'B'], index=[0])) + tm.assert_frame_equal(df, DataFrame(columns=['A', 'B'], index=[0])) # columns will align df = DataFrame(columns=['A', 'B']) df.loc[0] = Series(1, index=['B']) - assert_frame_equal(df, DataFrame( - [[np.nan, 1]], columns=['A', 'B'], index=[0], dtype='float64')) + + exp = DataFrame([[np.nan, 1]], columns=['A', 'B'], + index=[0], dtype='float64') + tm.assert_frame_equal(df, exp) # list-like must conform df = DataFrame(columns=['A', 'B']) @@ -4038,8 +3940,10 @@ def f(): # these are coerced to float unavoidably (as its a list-like to begin) df = DataFrame(columns=['A', 'B']) df.loc[3] = [6, 7] - assert_frame_equal(df, DataFrame( - [[6, 7]], index=[3], columns=['A', 'B'], dtype='float64')) + + exp = DataFrame([[6, 7]], index=[3], columns=['A', 'B'], + dtype='float64') + tm.assert_frame_equal(df, exp) def test_partial_setting_with_datetimelike_dtype(self): @@ -4053,7 +3957,7 @@ def test_partial_setting_with_datetimelike_dtype(self): mask = df.A < 1 df.loc[mask, 'C'] = df.loc[mask].index - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) def test_loc_setitem_datetime(self): @@ -4069,7 +3973,7 @@ def test_loc_setitem_datetime(self): df.loc[conv(dt2), 'one'] = 200 expected = DataFrame({'one': [100.0, 200.0]}, index=[dt1, dt2]) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) def test_series_partial_set(self): # partial set with new index @@ -4079,55 +3983,55 @@ def test_series_partial_set(self): # loc expected = Series([np.nan, 0.2, np.nan], index=[3, 2, 3]) result = ser.loc[[3, 2, 3]] - assert_series_equal(result, expected, check_index_type=True) + tm.assert_series_equal(result, expected, check_index_type=True) expected = Series([np.nan, 0.2, np.nan, np.nan], index=[3, 2, 3, 'x']) result = ser.loc[[3, 2, 3, 'x']] - assert_series_equal(result, expected, check_index_type=True) + tm.assert_series_equal(result, expected, check_index_type=True) expected = Series([0.2, 0.2, 0.1], index=[2, 2, 1]) result = ser.loc[[2, 2, 1]] - assert_series_equal(result, expected, check_index_type=True) + tm.assert_series_equal(result, expected, check_index_type=True) expected = Series([0.2, 0.2, np.nan, 0.1], index=[2, 2, 'x', 1]) result = ser.loc[[2, 2, 'x', 1]] - 
assert_series_equal(result, expected, check_index_type=True) + tm.assert_series_equal(result, expected, check_index_type=True) # raises as nothing in in the index self.assertRaises(KeyError, lambda: ser.loc[[3, 3, 3]]) expected = Series([0.2, 0.2, np.nan], index=[2, 2, 3]) result = ser.loc[[2, 2, 3]] - assert_series_equal(result, expected, check_index_type=True) + tm.assert_series_equal(result, expected, check_index_type=True) expected = Series([0.3, np.nan, np.nan], index=[3, 4, 4]) result = Series([0.1, 0.2, 0.3], index=[1, 2, 3]).loc[[3, 4, 4]] - assert_series_equal(result, expected, check_index_type=True) + tm.assert_series_equal(result, expected, check_index_type=True) expected = Series([np.nan, 0.3, 0.3], index=[5, 3, 3]) result = Series([0.1, 0.2, 0.3, 0.4], index=[1, 2, 3, 4]).loc[[5, 3, 3]] - assert_series_equal(result, expected, check_index_type=True) + tm.assert_series_equal(result, expected, check_index_type=True) expected = Series([np.nan, 0.4, 0.4], index=[5, 4, 4]) result = Series([0.1, 0.2, 0.3, 0.4], index=[1, 2, 3, 4]).loc[[5, 4, 4]] - assert_series_equal(result, expected, check_index_type=True) + tm.assert_series_equal(result, expected, check_index_type=True) expected = Series([0.4, np.nan, np.nan], index=[7, 2, 2]) result = Series([0.1, 0.2, 0.3, 0.4], index=[4, 5, 6, 7]).loc[[7, 2, 2]] - assert_series_equal(result, expected, check_index_type=True) + tm.assert_series_equal(result, expected, check_index_type=True) expected = Series([0.4, np.nan, np.nan], index=[4, 5, 5]) result = Series([0.1, 0.2, 0.3, 0.4], index=[1, 2, 3, 4]).loc[[4, 5, 5]] - assert_series_equal(result, expected, check_index_type=True) + tm.assert_series_equal(result, expected, check_index_type=True) # iloc expected = Series([0.2, 0.2, 0.1, 0.1], index=[2, 2, 1, 1]) result = ser.iloc[[1, 1, 0, 0]] - assert_series_equal(result, expected, check_index_type=True) + tm.assert_series_equal(result, expected, check_index_type=True) def test_series_partial_set_with_name(self): # GH 11497 @@ -4139,23 +4043,23 @@ def test_series_partial_set_with_name(self): exp_idx = Index([3, 2, 3], dtype='int64', name='idx') expected = Series([np.nan, 0.2, np.nan], index=exp_idx, name='s') result = ser.loc[[3, 2, 3]] - assert_series_equal(result, expected, check_index_type=True) + tm.assert_series_equal(result, expected, check_index_type=True) exp_idx = Index([3, 2, 3, 'x'], dtype='object', name='idx') expected = Series([np.nan, 0.2, np.nan, np.nan], index=exp_idx, name='s') result = ser.loc[[3, 2, 3, 'x']] - assert_series_equal(result, expected, check_index_type=True) + tm.assert_series_equal(result, expected, check_index_type=True) exp_idx = Index([2, 2, 1], dtype='int64', name='idx') expected = Series([0.2, 0.2, 0.1], index=exp_idx, name='s') result = ser.loc[[2, 2, 1]] - assert_series_equal(result, expected, check_index_type=True) + tm.assert_series_equal(result, expected, check_index_type=True) exp_idx = Index([2, 2, 'x', 1], dtype='object', name='idx') expected = Series([0.2, 0.2, np.nan, 0.1], index=exp_idx, name='s') result = ser.loc[[2, 2, 'x', 1]] - assert_series_equal(result, expected, check_index_type=True) + tm.assert_series_equal(result, expected, check_index_type=True) # raises as nothing in in the index self.assertRaises(KeyError, lambda: ser.loc[[3, 3, 3]]) @@ -4163,47 +4067,47 @@ def test_series_partial_set_with_name(self): exp_idx = Index([2, 2, 3], dtype='int64', name='idx') expected = Series([0.2, 0.2, np.nan], index=exp_idx, name='s') result = ser.loc[[2, 2, 3]] - assert_series_equal(result, expected, 
check_index_type=True) + tm.assert_series_equal(result, expected, check_index_type=True) exp_idx = Index([3, 4, 4], dtype='int64', name='idx') expected = Series([0.3, np.nan, np.nan], index=exp_idx, name='s') idx = Index([1, 2, 3], dtype='int64', name='idx') result = Series([0.1, 0.2, 0.3], index=idx, name='s').loc[[3, 4, 4]] - assert_series_equal(result, expected, check_index_type=True) + tm.assert_series_equal(result, expected, check_index_type=True) exp_idx = Index([5, 3, 3], dtype='int64', name='idx') expected = Series([np.nan, 0.3, 0.3], index=exp_idx, name='s') idx = Index([1, 2, 3, 4], dtype='int64', name='idx') result = Series([0.1, 0.2, 0.3, 0.4], index=idx, name='s').loc[[5, 3, 3]] - assert_series_equal(result, expected, check_index_type=True) + tm.assert_series_equal(result, expected, check_index_type=True) exp_idx = Index([5, 4, 4], dtype='int64', name='idx') expected = Series([np.nan, 0.4, 0.4], index=exp_idx, name='s') idx = Index([1, 2, 3, 4], dtype='int64', name='idx') result = Series([0.1, 0.2, 0.3, 0.4], index=idx, name='s').loc[[5, 4, 4]] - assert_series_equal(result, expected, check_index_type=True) + tm.assert_series_equal(result, expected, check_index_type=True) exp_idx = Index([7, 2, 2], dtype='int64', name='idx') expected = Series([0.4, np.nan, np.nan], index=exp_idx, name='s') idx = Index([4, 5, 6, 7], dtype='int64', name='idx') result = Series([0.1, 0.2, 0.3, 0.4], index=idx, name='s').loc[[7, 2, 2]] - assert_series_equal(result, expected, check_index_type=True) + tm.assert_series_equal(result, expected, check_index_type=True) exp_idx = Index([4, 5, 5], dtype='int64', name='idx') expected = Series([0.4, np.nan, np.nan], index=exp_idx, name='s') idx = Index([1, 2, 3, 4], dtype='int64', name='idx') result = Series([0.1, 0.2, 0.3, 0.4], index=idx, name='s').loc[[4, 5, 5]] - assert_series_equal(result, expected, check_index_type=True) + tm.assert_series_equal(result, expected, check_index_type=True) # iloc exp_idx = Index([2, 2, 1, 1], dtype='int64', name='idx') expected = Series([0.2, 0.2, 0.1, 0.1], index=exp_idx, name='s') result = ser.iloc[[1, 1, 0, 0]] - assert_series_equal(result, expected, check_index_type=True) + tm.assert_series_equal(result, expected, check_index_type=True) def test_series_partial_set_datetime(self): # GH 11497 @@ -4213,19 +4117,19 @@ def test_series_partial_set_datetime(self): result = ser.loc[[Timestamp('2011-01-01'), Timestamp('2011-01-02')]] exp = Series([0.1, 0.2], index=idx, name='s') - assert_series_equal(result, exp, check_index_type=True) + tm.assert_series_equal(result, exp, check_index_type=True) keys = [Timestamp('2011-01-02'), Timestamp('2011-01-02'), Timestamp('2011-01-01')] exp = Series([0.2, 0.2, 0.1], index=pd.DatetimeIndex(keys, name='idx'), name='s') - assert_series_equal(ser.loc[keys], exp, check_index_type=True) + tm.assert_series_equal(ser.loc[keys], exp, check_index_type=True) keys = [Timestamp('2011-01-03'), Timestamp('2011-01-02'), Timestamp('2011-01-03')] exp = Series([np.nan, 0.2, np.nan], index=pd.DatetimeIndex(keys, name='idx'), name='s') - assert_series_equal(ser.loc[keys], exp, check_index_type=True) + tm.assert_series_equal(ser.loc[keys], exp, check_index_type=True) def test_series_partial_set_period(self): # GH 11497 @@ -4233,17 +4137,17 @@ def test_series_partial_set_period(self): idx = pd.period_range('2011-01-01', '2011-01-02', freq='D', name='idx') ser = Series([0.1, 0.2], index=idx, name='s') - result = ser.loc[[pd.Period('2011-01-01', freq='D'), pd.Period( - '2011-01-02', freq='D')]] + result = 
ser.loc[[pd.Period('2011-01-01', freq='D'), + pd.Period('2011-01-02', freq='D')]] exp = Series([0.1, 0.2], index=idx, name='s') - assert_series_equal(result, exp, check_index_type=True) + tm.assert_series_equal(result, exp, check_index_type=True) keys = [pd.Period('2011-01-02', freq='D'), pd.Period('2011-01-02', freq='D'), pd.Period('2011-01-01', freq='D')] exp = Series([0.2, 0.2, 0.1], index=pd.PeriodIndex(keys, name='idx'), name='s') - assert_series_equal(ser.loc[keys], exp, check_index_type=True) + tm.assert_series_equal(ser.loc[keys], exp, check_index_type=True) keys = [pd.Period('2011-01-03', freq='D'), pd.Period('2011-01-02', freq='D'), @@ -4251,14 +4155,15 @@ def test_series_partial_set_period(self): exp = Series([np.nan, 0.2, np.nan], index=pd.PeriodIndex(keys, name='idx'), name='s') result = ser.loc[keys] - assert_series_equal(result, exp) + tm.assert_series_equal(result, exp) def test_partial_set_invalid(self): # GH 4940 # allow only setting of 'valid' values - df = tm.makeTimeDataFrame() + orig = tm.makeTimeDataFrame() + df = orig.copy() # don't allow not string inserts def f(): @@ -4282,33 +4187,40 @@ def f(): self.assertRaises(ValueError, f) # allow object conversion here + df = orig.copy() df.loc['a', :] = df.ix[0] + exp = orig.append(pd.Series(df.ix[0], name='a')) + tm.assert_frame_equal(df, exp) + tm.assert_index_equal(df.index, + pd.Index(orig.index.tolist() + ['a'])) + self.assertEqual(df.index.dtype, 'object') - def test_partial_set_empty(self): + def test_partial_set_empty_series(self): # GH5226 - # partially set with an empty object - # series + # partially set with an empty object series s = Series() s.loc[1] = 1 - assert_series_equal(s, Series([1], index=[1])) + tm.assert_series_equal(s, Series([1], index=[1])) s.loc[3] = 3 - assert_series_equal(s, Series([1, 3], index=[1, 3])) + tm.assert_series_equal(s, Series([1, 3], index=[1, 3])) s = Series() s.loc[1] = 1. - assert_series_equal(s, Series([1.], index=[1])) + tm.assert_series_equal(s, Series([1.], index=[1])) s.loc[3] = 3. 
- assert_series_equal(s, Series([1., 3.], index=[1, 3])) + tm.assert_series_equal(s, Series([1., 3.], index=[1, 3])) s = Series() s.loc['foo'] = 1 - assert_series_equal(s, Series([1], index=['foo'])) + tm.assert_series_equal(s, Series([1], index=['foo'])) s.loc['bar'] = 3 - assert_series_equal(s, Series([1, 3], index=['foo', 'bar'])) + tm.assert_series_equal(s, Series([1, 3], index=['foo', 'bar'])) s.loc[3] = 4 - assert_series_equal(s, Series([1, 3, 4], index=['foo', 'bar', 3])) + tm.assert_series_equal(s, Series([1, 3, 4], index=['foo', 'bar', 3])) + + def test_partial_set_empty_frame(self): # partially set with an empty object # frame @@ -4340,24 +4252,24 @@ def f(): df['foo'] = Series([], dtype='object') return df - assert_frame_equal(f(), expected) + tm.assert_frame_equal(f(), expected) def f(): df = DataFrame() df['foo'] = Series(df.index) return df - assert_frame_equal(f(), expected) + tm.assert_frame_equal(f(), expected) def f(): df = DataFrame() df['foo'] = df.index return df - assert_frame_equal(f(), expected) + tm.assert_frame_equal(f(), expected) - expected = DataFrame(columns=['foo'], index=pd.Index( - [], dtype='int64')) + expected = DataFrame(columns=['foo'], + index=pd.Index([], dtype='int64')) expected['foo'] = expected['foo'].astype('float64') def f(): @@ -4365,109 +4277,119 @@ def f(): df['foo'] = [] return df - assert_frame_equal(f(), expected) + tm.assert_frame_equal(f(), expected) def f(): df = DataFrame() df['foo'] = Series(range(len(df))) return df - assert_frame_equal(f(), expected) + tm.assert_frame_equal(f(), expected) def f(): df = DataFrame() + tm.assert_index_equal(df.index, pd.Index([], dtype='object')) df['foo'] = range(len(df)) return df - assert_frame_equal(f(), expected) + expected = DataFrame(columns=['foo'], + index=pd.Index([], dtype='int64')) + expected['foo'] = expected['foo'].astype('float64') + tm.assert_frame_equal(f(), expected) df = DataFrame() + tm.assert_index_equal(df.columns, pd.Index([], dtype=object)) df2 = DataFrame() df2[1] = Series([1], index=['foo']) df.loc[:, 1] = Series([1], index=['foo']) - assert_frame_equal(df, DataFrame([[1]], index=['foo'], columns=[1])) - assert_frame_equal(df, df2) + tm.assert_frame_equal(df, DataFrame([[1]], index=['foo'], columns=[1])) + tm.assert_frame_equal(df, df2) # no index to start - expected = DataFrame( - {0: Series(1, index=range(4))}, columns=['A', 'B', 0]) + expected = DataFrame({0: Series(1, index=range(4))}, + columns=['A', 'B', 0]) df = DataFrame(columns=['A', 'B']) df[0] = Series(1, index=range(4)) df.dtypes str(df) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) df = DataFrame(columns=['A', 'B']) df.loc[:, 0] = Series(1, index=range(4)) df.dtypes str(df) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) + def test_partial_set_empty_frame_row(self): # GH5720, GH5744 # don't create rows when empty - expected = DataFrame(columns=['A', 'B', 'New'], index=pd.Index( - [], dtype='int64')) + expected = DataFrame(columns=['A', 'B', 'New'], + index=pd.Index([], dtype='int64')) expected['A'] = expected['A'].astype('int64') expected['B'] = expected['B'].astype('float64') expected['New'] = expected['New'].astype('float64') + df = DataFrame({"A": [1, 2, 3], "B": [1.2, 4.2, 5.2]}) y = df[df.A > 5] y['New'] = np.nan - assert_frame_equal(y, expected) - # assert_frame_equal(y,expected) + tm.assert_frame_equal(y, expected) + # tm.assert_frame_equal(y,expected) expected = DataFrame(columns=['a', 'b', 'c c', 'd']) expected['d'] = expected['d'].astype('int64') df = 
DataFrame(columns=['a', 'b', 'c c']) df['d'] = 3 - assert_frame_equal(df, expected) - assert_series_equal(df['c c'], Series(name='c c', dtype=object)) + tm.assert_frame_equal(df, expected) + tm.assert_series_equal(df['c c'], Series(name='c c', dtype=object)) # reindex columns is ok df = DataFrame({"A": [1, 2, 3], "B": [1.2, 4.2, 5.2]}) y = df[df.A > 5] result = y.reindex(columns=['A', 'B', 'C']) - expected = DataFrame(columns=['A', 'B', 'C'], index=pd.Index( - [], dtype='int64')) + expected = DataFrame(columns=['A', 'B', 'C'], + index=pd.Index([], dtype='int64')) expected['A'] = expected['A'].astype('int64') expected['B'] = expected['B'].astype('float64') expected['C'] = expected['C'].astype('float64') - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) + def test_partial_set_empty_frame_set_series(self): # GH 5756 # setting with empty Series df = DataFrame(Series()) - assert_frame_equal(df, DataFrame({0: Series()})) + tm.assert_frame_equal(df, DataFrame({0: Series()})) df = DataFrame(Series(name='foo')) - assert_frame_equal(df, DataFrame({'foo': Series()})) + tm.assert_frame_equal(df, DataFrame({'foo': Series()})) + def test_partial_set_empty_frame_empty_copy_assignment(self): # GH 5932 # copy on empty with assignment fails df = DataFrame(index=[0]) df = df.copy() df['a'] = 0 expected = DataFrame(0, index=[0], columns=['a']) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) + def test_partial_set_empty_frame_empty_consistencies(self): # GH 6171 # consistency on empty frames df = DataFrame(columns=['x', 'y']) df['x'] = [1, 2] expected = DataFrame(dict(x=[1, 2], y=[np.nan, np.nan])) - assert_frame_equal(df, expected, check_dtype=False) + tm.assert_frame_equal(df, expected, check_dtype=False) df = DataFrame(columns=['x', 'y']) df['x'] = ['1', '2'] expected = DataFrame( dict(x=['1', '2'], y=[np.nan, np.nan]), dtype=object) - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) df = DataFrame(columns=['x', 'y']) df.loc[0, 'x'] = 1 expected = DataFrame(dict(x=[1], y=[np.nan])) - assert_frame_equal(df, expected, check_dtype=False) + tm.assert_frame_equal(df, expected, check_dtype=False) def test_cache_updating(self): # GH 4939, make sure to update the cache on setitem @@ -4516,9 +4438,9 @@ def test_cache_updating(self): expected = DataFrame(np.zeros((5, 6), dtype='int64'), columns=[ 'a', 'b', 'c', 'd', 'e', 'f'], index=range(5)) expected.at[3, 'f'] = 2 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) expected = Series([0, 0, 0, 2, 0], name='f') - assert_series_equal(df.f, expected) + tm.assert_series_equal(df.f, expected) def test_slice_consolidate_invalidate_item_cache(self): @@ -4575,8 +4497,8 @@ def test_setitem_cache_updating(self): for ix, row in df.iterrows(): out.loc[six:eix, row['C']] = out.loc[six:eix, row['C']] + row['D'] - assert_frame_equal(out, expected) - assert_series_equal(out['A'], expected['A']) + tm.assert_frame_equal(out, expected) + tm.assert_series_equal(out['A'], expected['A']) # try via a chain indexing # this actually works @@ -4586,16 +4508,16 @@ def test_setitem_cache_updating(self): v = out[row['C']][six:eix] + row['D'] out[row['C']][six:eix] = v - assert_frame_equal(out, expected) - assert_series_equal(out['A'], expected['A']) + tm.assert_frame_equal(out, expected) + tm.assert_series_equal(out['A'], expected['A']) out = DataFrame({'A': [0, 0, 0]}, index=date_range('5/7/2014', '5/9/2014')) for ix, row in df.iterrows(): out.loc[six:eix, row['C']] += row['D'] - 
assert_frame_equal(out, expected) - assert_series_equal(out['A'], expected['A']) + tm.assert_frame_equal(out, expected) + tm.assert_series_equal(out['A'], expected['A']) def test_setitem_chained_setfault(self): @@ -4607,31 +4529,31 @@ def test_setitem_chained_setfault(self): df = DataFrame({'response': np.array(data)}) mask = df.response == 'timeout' df.response[mask] = 'none' - assert_frame_equal(df, DataFrame({'response': mdata})) + tm.assert_frame_equal(df, DataFrame({'response': mdata})) recarray = np.rec.fromarrays([data], names=['response']) df = DataFrame(recarray) mask = df.response == 'timeout' df.response[mask] = 'none' - assert_frame_equal(df, DataFrame({'response': mdata})) + tm.assert_frame_equal(df, DataFrame({'response': mdata})) df = DataFrame({'response': data, 'response1': data}) mask = df.response == 'timeout' df.response[mask] = 'none' - assert_frame_equal(df, DataFrame({'response': mdata, - 'response1': data})) + tm.assert_frame_equal(df, DataFrame({'response': mdata, + 'response1': data})) # GH 6056 expected = DataFrame(dict(A=[np.nan, 'bar', 'bah', 'foo', 'bar'])) df = DataFrame(dict(A=np.array(['foo', 'bar', 'bah', 'foo', 'bar']))) df['A'].iloc[0] = np.nan result = df.head() - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) df = DataFrame(dict(A=np.array(['foo', 'bar', 'bah', 'foo', 'bar']))) df.A.iloc[0] = np.nan result = df.head() - assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_detect_chained_assignment(self): @@ -4639,18 +4561,16 @@ def test_detect_chained_assignment(self): # work with the chain expected = DataFrame([[-5, 1], [-6, 3]], columns=list('AB')) - df = DataFrame( - np.arange(4).reshape(2, 2), columns=list('AB'), dtype='int64') + df = DataFrame(np.arange(4).reshape(2, 2), + columns=list('AB'), dtype='int64') self.assertIsNone(df.is_copy) df['A'][0] = -5 df['A'][1] = -6 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # test with the chaining - df = DataFrame({'A': Series( - range(2), dtype='int64'), - 'B': np.array( - np.arange(2, 4), dtype=np.float64)}) + df = DataFrame({'A': Series(range(2), dtype='int64'), + 'B': np.array(np.arange(2, 4), dtype=np.float64)}) self.assertIsNone(df.is_copy) def f(): @@ -4665,10 +4585,8 @@ def f(): self.assertIsNone(df['A'].is_copy) # using a copy (the chain), fails - df = DataFrame({'A': Series( - range(2), dtype='int64'), - 'B': np.array( - np.arange(2, 4), dtype=np.float64)}) + df = DataFrame({'A': Series(range(2), dtype='int64'), + 'B': np.array(np.arange(2, 4), dtype=np.float64)}) def f(): df.loc[0]['A'] = -5 @@ -4676,13 +4594,12 @@ def f(): self.assertRaises(com.SettingWithCopyError, f) # doc example - df = DataFrame({'a': ['one', 'one', 'two', 'three', 'two', 'one', 'six' - ], - 'c': Series( - range(7), dtype='int64')}) + df = DataFrame({'a': ['one', 'one', 'two', 'three', + 'two', 'one', 'six'], + 'c': Series(range(7), dtype='int64')}) self.assertIsNone(df.is_copy) - expected = DataFrame({'a': ['one', 'one', 'two', 'three', 'two', 'one', - 'six'], + expected = DataFrame({'a': ['one', 'one', 'two', 'three', + 'two', 'one', 'six'], 'c': [42, 42, 2, 3, 4, 42, 6]}) def f(): @@ -4705,7 +4622,7 @@ def f(): self.assertRaises(com.SettingWithCopyError, f) df.loc[0, 'A'] = 111 - assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) # make sure that is_copy is picked up reconstruction # GH5475 @@ -4795,8 +4712,8 @@ def f(): df = DataFrame(np.random.randn(10, 4)) s = df.iloc[:, 0].sort_values() - 
assert_series_equal(s, df.iloc[:, 0].sort_values()) - assert_series_equal(s, df[0].sort_values()) + tm.assert_series_equal(s, df.iloc[:, 0].sort_values()) + tm.assert_series_equal(s, df[0].sort_values()) # false positives GH6025 df = DataFrame({'column1': ['a', 'a', 'a'], 'column2': [4, 8, 9]}) @@ -5044,40 +4961,45 @@ def test_iloc_empty_list_indexer_is_ok(self): from pandas.util.testing import makeCustomDataframe as mkdf df = mkdf(5, 2) # vertical empty - assert_frame_equal(df.iloc[:, []], df.iloc[:, :0], - check_index_type=True, check_column_type=True) + tm.assert_frame_equal(df.iloc[:, []], df.iloc[:, :0], + check_index_type=True, check_column_type=True) # horizontal empty - assert_frame_equal(df.iloc[[], :], df.iloc[:0, :], - check_index_type=True, check_column_type=True) + tm.assert_frame_equal(df.iloc[[], :], df.iloc[:0, :], + check_index_type=True, check_column_type=True) # horizontal empty - assert_frame_equal(df.iloc[[]], df.iloc[:0, :], check_index_type=True, - check_column_type=True) + tm.assert_frame_equal(df.iloc[[]], df.iloc[:0, :], + check_index_type=True, + check_column_type=True) def test_loc_empty_list_indexer_is_ok(self): from pandas.util.testing import makeCustomDataframe as mkdf df = mkdf(5, 2) # vertical empty - assert_frame_equal(df.loc[:, []], df.iloc[:, :0], - check_index_type=True, check_column_type=True) + tm.assert_frame_equal(df.loc[:, []], df.iloc[:, :0], + check_index_type=True, check_column_type=True) # horizontal empty - assert_frame_equal(df.loc[[], :], df.iloc[:0, :], - check_index_type=True, check_column_type=True) + tm.assert_frame_equal(df.loc[[], :], df.iloc[:0, :], + check_index_type=True, check_column_type=True) # horizontal empty - assert_frame_equal(df.loc[[]], df.iloc[:0, :], check_index_type=True, - check_column_type=True) + tm.assert_frame_equal(df.loc[[]], df.iloc[:0, :], + check_index_type=True, + check_column_type=True) def test_ix_empty_list_indexer_is_ok(self): from pandas.util.testing import makeCustomDataframe as mkdf df = mkdf(5, 2) # vertical empty - assert_frame_equal(df.ix[:, []], df.iloc[:, :0], check_index_type=True, - check_column_type=True) + tm.assert_frame_equal(df.ix[:, []], df.iloc[:, :0], + check_index_type=True, + check_column_type=True) # horizontal empty - assert_frame_equal(df.ix[[], :], df.iloc[:0, :], check_index_type=True, - check_column_type=True) + tm.assert_frame_equal(df.ix[[], :], df.iloc[:0, :], + check_index_type=True, + check_column_type=True) # horizontal empty - assert_frame_equal(df.ix[[]], df.iloc[:0, :], check_index_type=True, - check_column_type=True) + tm.assert_frame_equal(df.ix[[]], df.iloc[:0, :], + check_index_type=True, + check_column_type=True) def test_index_type_coercion(self): @@ -5176,23 +5098,23 @@ def run_tests(df, rhs, right): left = df.copy() left.loc[r, c] = rhs - assert_frame_equal(left, right) + tm.assert_frame_equal(left, right) left = df.copy() left.iloc[i, j] = rhs - assert_frame_equal(left, right) + tm.assert_frame_equal(left, right) left = df.copy() left.ix[s, l] = rhs - assert_frame_equal(left, right) + tm.assert_frame_equal(left, right) left = df.copy() left.ix[i, j] = rhs - assert_frame_equal(left, right) + tm.assert_frame_equal(left, right) left = df.copy() left.ix[r, c] = rhs - assert_frame_equal(left, right) + tm.assert_frame_equal(left, right) xs = np.arange(20).reshape(5, 4) cols = ['jim', 'joe', 'jolie', 'joline'] @@ -5219,12 +5141,12 @@ def test_str_label_slicing_with_negative_step(self): SLC = pd.IndexSlice def assert_slices_equivalent(l_slc, i_slc): - 
assert_series_equal(s.loc[l_slc], s.iloc[i_slc]) + tm.assert_series_equal(s.loc[l_slc], s.iloc[i_slc]) if not idx.is_integer: # For integer indices, ix and plain getitem are position-based. - assert_series_equal(s[l_slc], s.iloc[i_slc]) - assert_series_equal(s.ix[l_slc], s.iloc[i_slc]) + tm.assert_series_equal(s[l_slc], s.iloc[i_slc]) + tm.assert_series_equal(s.ix[l_slc], s.iloc[i_slc]) for idx in [_mklbl('A', 20), np.arange(20) + 100, np.linspace(100, 150, 20)]: @@ -5241,9 +5163,9 @@ def test_multiindex_label_slicing_with_negative_step(self): SLC = pd.IndexSlice def assert_slices_equivalent(l_slc, i_slc): - assert_series_equal(s.loc[l_slc], s.iloc[i_slc]) - assert_series_equal(s[l_slc], s.iloc[i_slc]) - assert_series_equal(s.ix[l_slc], s.iloc[i_slc]) + tm.assert_series_equal(s.loc[l_slc], s.iloc[i_slc]) + tm.assert_series_equal(s[l_slc], s.iloc[i_slc]) + tm.assert_series_equal(s.ix[l_slc], s.iloc[i_slc]) assert_slices_equivalent(SLC[::-1], SLC[::-1]) @@ -5288,8 +5210,8 @@ def test_indexing_dtypes_on_empty(self): df2 = df.ix[[], :] self.assertEqual(df2.loc[:, 'a'].dtype, np.int64) - assert_series_equal(df2.loc[:, 'a'], df2.iloc[:, 0]) - assert_series_equal(df2.loc[:, 'a'], df2.ix[:, 0]) + tm.assert_series_equal(df2.loc[:, 'a'], df2.iloc[:, 0]) + tm.assert_series_equal(df2.loc[:, 'a'], df2.ix[:, 0]) def test_range_in_series_indexing(self): # range can cause an indexing error @@ -5297,24 +5219,10 @@ def test_range_in_series_indexing(self): for x in [5, 999999, 1000000]: s = pd.Series(index=range(x)) s.loc[range(1)] = 42 - assert_series_equal(s.loc[range(1)], Series(42.0, index=[0])) + tm.assert_series_equal(s.loc[range(1)], Series(42.0, index=[0])) s.loc[range(2)] = 43 - assert_series_equal(s.loc[range(2)], Series(43.0, index=[0, 1])) - - @slow - def test_large_dataframe_indexing(self): - # GH10692 - result = DataFrame({'x': range(10 ** 6)}, dtype='int64') - result.loc[len(result)] = len(result) + 1 - expected = DataFrame({'x': range(10 ** 6 + 1)}, dtype='int64') - assert_frame_equal(result, expected) - - @slow - def test_large_mi_dataframe_indexing(self): - # GH10645 - result = MultiIndex.from_arrays([range(10 ** 6), range(10 ** 6)]) - assert (not (10 ** 6, 0) in result) + tm.assert_series_equal(s.loc[range(2)], Series(43.0, index=[0, 1])) def test_non_reducing_slice(self): df = pd.DataFrame([[0, 1], [2, 3]]) @@ -5379,11 +5287,7 @@ def test_coercion_with_setitem(self): start_series[0] = None expected_series = Series(expected_result) - - assert_attr_equal('dtype', start_series, expected_series) - tm.assert_numpy_array_equal(start_series.values, - expected_series.values, - strict_nan=True) + tm.assert_series_equal(start_series, expected_series) def test_coercion_with_loc_setitem(self): for start_data, expected_result in self.EXPECTED_RESULTS: @@ -5391,11 +5295,7 @@ def test_coercion_with_loc_setitem(self): start_series.loc[0] = None expected_series = Series(expected_result) - - assert_attr_equal('dtype', start_series, expected_series) - tm.assert_numpy_array_equal(start_series.values, - expected_series.values, - strict_nan=True) + tm.assert_series_equal(start_series, expected_series) def test_coercion_with_setitem_and_series(self): for start_data, expected_result in self.EXPECTED_RESULTS: @@ -5403,11 +5303,7 @@ def test_coercion_with_setitem_and_series(self): start_series[start_series == start_series[0]] = None expected_series = Series(expected_result) - - assert_attr_equal('dtype', start_series, expected_series) - tm.assert_numpy_array_equal(start_series.values, - expected_series.values, - 
strict_nan=True) + tm.assert_series_equal(start_series, expected_series) def test_coercion_with_loc_and_series(self): for start_data, expected_result in self.EXPECTED_RESULTS: @@ -5415,11 +5311,7 @@ def test_coercion_with_loc_and_series(self): start_series.loc[start_series == start_series[0]] = None expected_series = Series(expected_result) - - assert_attr_equal('dtype', start_series, expected_series) - tm.assert_numpy_array_equal(start_series.values, - expected_series.values, - strict_nan=True) + tm.assert_series_equal(start_series, expected_series) class TestDataframeNoneCoercion(tm.TestCase): @@ -5442,12 +5334,7 @@ def test_coercion_with_loc(self): start_dataframe.loc[0, ['foo']] = None expected_dataframe = DataFrame({'foo': expected_result}) - - assert_attr_equal('dtype', start_dataframe['foo'], - expected_dataframe['foo']) - tm.assert_numpy_array_equal(start_dataframe['foo'].values, - expected_dataframe['foo'].values, - strict_nan=True) + tm.assert_frame_equal(start_dataframe, expected_dataframe) def test_coercion_with_setitem_and_dataframe(self): for start_data, expected_result, in self.EXPECTED_SINGLE_ROW_RESULTS: @@ -5456,12 +5343,7 @@ def test_coercion_with_setitem_and_dataframe(self): 0]] = None expected_dataframe = DataFrame({'foo': expected_result}) - - assert_attr_equal('dtype', start_dataframe['foo'], - expected_dataframe['foo']) - tm.assert_numpy_array_equal(start_dataframe['foo'].values, - expected_dataframe['foo'].values, - strict_nan=True) + tm.assert_frame_equal(start_dataframe, expected_dataframe) def test_none_coercion_loc_and_dataframe(self): for start_data, expected_result, in self.EXPECTED_SINGLE_ROW_RESULTS: @@ -5470,12 +5352,7 @@ def test_none_coercion_loc_and_dataframe(self): 'foo'][0]] = None expected_dataframe = DataFrame({'foo': expected_result}) - - assert_attr_equal('dtype', start_dataframe['foo'], - expected_dataframe['foo']) - tm.assert_numpy_array_equal(start_dataframe['foo'].values, - expected_dataframe['foo'].values, - strict_nan=True) + tm.assert_frame_equal(start_dataframe, expected_dataframe) def test_none_coercion_mixed_dtypes(self): start_dataframe = DataFrame({ @@ -5487,19 +5364,12 @@ def test_none_coercion_mixed_dtypes(self): }) start_dataframe.iloc[0] = None - expected_dataframe = DataFrame({ - 'a': [np.nan, 2, 3], - 'b': [np.nan, 2.0, 3.0], - 'c': [NaT, datetime(2000, 1, 2), datetime(2000, 1, 3)], - 'd': [None, 'b', 'c'] - }) - - for column in expected_dataframe.columns: - assert_attr_equal('dtype', start_dataframe[column], - expected_dataframe[column]) - tm.assert_numpy_array_equal(start_dataframe[column].values, - expected_dataframe[column].values, - strict_nan=True) + exp = DataFrame({'a': [np.nan, 2, 3], + 'b': [np.nan, 2.0, 3.0], + 'c': [NaT, datetime(2000, 1, 2), + datetime(2000, 1, 3)], + 'd': [None, 'b', 'c']}) + tm.assert_frame_equal(start_dataframe, exp) if __name__ == '__main__': diff --git a/pandas/tests/indexing/test_indexing_slow.py b/pandas/tests/indexing/test_indexing_slow.py new file mode 100644 index 0000000000000..5d563e20087b9 --- /dev/null +++ b/pandas/tests/indexing/test_indexing_slow.py @@ -0,0 +1,97 @@ +# -*- coding: utf-8 -*- + +import warnings + +import numpy as np +import pandas as pd +from pandas.core.api import Series, DataFrame, MultiIndex +import pandas.util.testing as tm + + +class TestIndexingSlow(tm.TestCase): + + _multiprocess_can_split_ = True + + @tm.slow + def test_multiindex_get_loc(self): # GH7724, GH2646 + + with warnings.catch_warnings(record=True): + + # test indexing into a multi-index before & past 
the lexsort depth + from numpy.random import randint, choice, randn + cols = ['jim', 'joe', 'jolie', 'joline', 'jolia'] + + def validate(mi, df, key): + mask = np.ones(len(df)).astype('bool') + + # test for all partials of this key + for i, k in enumerate(key): + mask &= df.iloc[:, i] == k + + if not mask.any(): + self.assertNotIn(key[:i + 1], mi.index) + continue + + self.assertIn(key[:i + 1], mi.index) + right = df[mask].copy() + + if i + 1 != len(key): # partial key + right.drop(cols[:i + 1], axis=1, inplace=True) + right.set_index(cols[i + 1:-1], inplace=True) + tm.assert_frame_equal(mi.loc[key[:i + 1]], right) + + else: # full key + right.set_index(cols[:-1], inplace=True) + if len(right) == 1: # single hit + right = Series(right['jolia'].values, + name=right.index[0], + index=['jolia']) + tm.assert_series_equal(mi.loc[key[:i + 1]], right) + else: # multi hit + tm.assert_frame_equal(mi.loc[key[:i + 1]], right) + + def loop(mi, df, keys): + for key in keys: + validate(mi, df, key) + + n, m = 1000, 50 + + vals = [randint(0, 10, n), choice( + list('abcdefghij'), n), choice( + pd.date_range('20141009', periods=10).tolist(), n), choice( + list('ZYXWVUTSRQ'), n), randn(n)] + vals = list(map(tuple, zip(*vals))) + + # bunch of keys for testing + keys = [randint(0, 11, m), choice( + list('abcdefghijk'), m), choice( + pd.date_range('20141009', periods=11).tolist(), m), choice( + list('ZYXWVUTSRQP'), m)] + keys = list(map(tuple, zip(*keys))) + keys += list(map(lambda t: t[:-1], vals[::n // m])) + + # covers both unique index and non-unique index + df = pd.DataFrame(vals, columns=cols) + a, b = pd.concat([df, df]), df.drop_duplicates(subset=cols[:-1]) + + for frame in a, b: + for i in range(5): # lexsort depth + df = frame.copy() if i == 0 else frame.sort_values( + by=cols[:i]) + mi = df.set_index(cols[:-1]) + assert not mi.index.lexsort_depth < i + loop(mi, df, keys) + + @tm.slow + def test_large_dataframe_indexing(self): + # GH10692 + result = DataFrame({'x': range(10 ** 6)}, dtype='int64') + result.loc[len(result)] = len(result) + 1 + expected = DataFrame({'x': range(10 ** 6 + 1)}, dtype='int64') + tm.assert_frame_equal(result, expected) + + @tm.slow + def test_large_mi_dataframe_indexing(self): + # GH10645 + result = MultiIndex.from_arrays([range(10 ** 6), range(10 ** 6)]) + assert (not (10 ** 6, 0) in result) diff --git a/pandas/tests/plotting/__init__.py b/pandas/tests/plotting/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py new file mode 100644 index 0000000000000..9fe1d7cacd38f --- /dev/null +++ b/pandas/tests/plotting/common.py @@ -0,0 +1,563 @@ +#!/usr/bin/env python +# coding: utf-8 + +import nose +import os +import warnings + +from pandas import DataFrame, Series +from pandas.compat import zip, iteritems +from pandas.util.decorators import cache_readonly +from pandas.types.api import is_list_like +import pandas.util.testing as tm +from pandas.util.testing import (ensure_clean, + assert_is_valid_plot_return_object) + +import numpy as np +from numpy import random + +import pandas.tools.plotting as plotting + + +""" +This is a common base class used for various plotting tests +""" + + +def _skip_if_no_scipy_gaussian_kde(): + try: + from scipy.stats import gaussian_kde # noqa + except ImportError: + raise nose.SkipTest("scipy version doesn't support gaussian_kde") + + +def _ok_for_gaussian_kde(kind): + if kind in ['kde', 'density']: + try: + from scipy.stats import gaussian_kde # noqa + 
except ImportError: + return False + return True + + +@tm.mplskip +class TestPlotBase(tm.TestCase): + + def setUp(self): + + import matplotlib as mpl + mpl.rcdefaults() + + self.mpl_le_1_2_1 = plotting._mpl_le_1_2_1() + self.mpl_ge_1_3_1 = plotting._mpl_ge_1_3_1() + self.mpl_ge_1_4_0 = plotting._mpl_ge_1_4_0() + self.mpl_ge_1_5_0 = plotting._mpl_ge_1_5_0() + self.mpl_ge_2_0_0 = plotting._mpl_ge_2_0_0() + + if self.mpl_ge_1_4_0: + self.bp_n_objects = 7 + else: + self.bp_n_objects = 8 + if self.mpl_ge_1_5_0: + # 1.5 added PolyCollections to legend handler + # so we have twice as many items. + self.polycollection_factor = 2 + else: + self.polycollection_factor = 1 + + if self.mpl_ge_2_0_0: + self.default_figsize = (6.4, 4.8) + else: + self.default_figsize = (8.0, 6.0) + self.default_tick_position = 'left' if self.mpl_ge_2_0_0 else 'default' + # common test data + from pandas import read_csv + path = os.path.join(os.path.dirname(curpath()), 'data', 'iris.csv') + self.iris = read_csv(path) + + n = 100 + with tm.RNGContext(42): + gender = np.random.choice(['Male', 'Female'], size=n) + classroom = np.random.choice(['A', 'B', 'C'], size=n) + + self.hist_df = DataFrame({'gender': gender, + 'classroom': classroom, + 'height': random.normal(66, 4, size=n), + 'weight': random.normal(161, 32, size=n), + 'category': random.randint(4, size=n)}) + + self.tdf = tm.makeTimeDataFrame() + self.hexbin_df = DataFrame({"A": np.random.uniform(size=20), + "B": np.random.uniform(size=20), + "C": np.arange(20) + np.random.uniform( + size=20)}) + + def tearDown(self): + tm.close() + + @cache_readonly + def plt(self): + import matplotlib.pyplot as plt + return plt + + @cache_readonly + def colorconverter(self): + import matplotlib.colors as colors + return colors.colorConverter + + def _check_legend_labels(self, axes, labels=None, visible=True): + """ + Check each axes has expected legend labels + + Parameters + ---------- + axes : matplotlib Axes object, or its list-like + labels : list-like + expected legend labels + visible : bool + expected legend visibility. 
labels are checked only when visible is + True + """ + + if visible and (labels is None): + raise ValueError('labels must be specified when visible is True') + axes = self._flatten_visible(axes) + for ax in axes: + if visible: + self.assertTrue(ax.get_legend() is not None) + self._check_text_labels(ax.get_legend().get_texts(), labels) + else: + self.assertTrue(ax.get_legend() is None) + + def _check_data(self, xp, rs): + """ + Check each axes has identical lines + + Parameters + ---------- + xp : matplotlib Axes object + rs : matplotlib Axes object + """ + xp_lines = xp.get_lines() + rs_lines = rs.get_lines() + + def check_line(xpl, rsl): + xpdata = xpl.get_xydata() + rsdata = rsl.get_xydata() + tm.assert_almost_equal(xpdata, rsdata) + + self.assertEqual(len(xp_lines), len(rs_lines)) + [check_line(xpl, rsl) for xpl, rsl in zip(xp_lines, rs_lines)] + tm.close() + + def _check_visible(self, collections, visible=True): + """ + Check each artist is visible or not + + Parameters + ---------- + collections : matplotlib Artist or its list-like + target Artist or its list or collection + visible : bool + expected visibility + """ + from matplotlib.collections import Collection + if not isinstance(collections, + Collection) and not is_list_like(collections): + collections = [collections] + + for patch in collections: + self.assertEqual(patch.get_visible(), visible) + + def _get_colors_mapped(self, series, colors): + unique = series.unique() + # unique and colors length can be differed + # depending on slice value + mapped = dict(zip(unique, colors)) + return [mapped[v] for v in series.values] + + def _check_colors(self, collections, linecolors=None, facecolors=None, + mapping=None): + """ + Check each artist has expected line colors and face colors + + Parameters + ---------- + collections : list-like + list or collection of target artist + linecolors : list-like which has the same length as collections + list of expected line colors + facecolors : list-like which has the same length as collections + list of expected face colors + mapping : Series + Series used for color grouping key + used for andrew_curves, parallel_coordinates, radviz test + """ + + from matplotlib.lines import Line2D + from matplotlib.collections import ( + Collection, PolyCollection, LineCollection + ) + conv = self.colorconverter + if linecolors is not None: + + if mapping is not None: + linecolors = self._get_colors_mapped(mapping, linecolors) + linecolors = linecolors[:len(collections)] + + self.assertEqual(len(collections), len(linecolors)) + for patch, color in zip(collections, linecolors): + if isinstance(patch, Line2D): + result = patch.get_color() + # Line2D may contains string color expression + result = conv.to_rgba(result) + elif isinstance(patch, (PolyCollection, LineCollection)): + result = tuple(patch.get_edgecolor()[0]) + else: + result = patch.get_edgecolor() + + expected = conv.to_rgba(color) + self.assertEqual(result, expected) + + if facecolors is not None: + + if mapping is not None: + facecolors = self._get_colors_mapped(mapping, facecolors) + facecolors = facecolors[:len(collections)] + + self.assertEqual(len(collections), len(facecolors)) + for patch, color in zip(collections, facecolors): + if isinstance(patch, Collection): + # returned as list of np.array + result = patch.get_facecolor()[0] + else: + result = patch.get_facecolor() + + if isinstance(result, np.ndarray): + result = tuple(result) + + expected = conv.to_rgba(color) + self.assertEqual(result, expected) + + def _check_text_labels(self, 
texts, expected): + """ + Check each text has expected labels + + Parameters + ---------- + texts : matplotlib Text object, or its list-like + target text, or its list + expected : str or list-like which has the same length as texts + expected text label, or its list + """ + if not is_list_like(texts): + self.assertEqual(texts.get_text(), expected) + else: + labels = [t.get_text() for t in texts] + self.assertEqual(len(labels), len(expected)) + for l, e in zip(labels, expected): + self.assertEqual(l, e) + + def _check_ticks_props(self, axes, xlabelsize=None, xrot=None, + ylabelsize=None, yrot=None): + """ + Check each axes has expected tick properties + + Parameters + ---------- + axes : matplotlib Axes object, or its list-like + xlabelsize : number + expected xticks font size + xrot : number + expected xticks rotation + ylabelsize : number + expected yticks font size + yrot : number + expected yticks rotation + """ + from matplotlib.ticker import NullFormatter + axes = self._flatten_visible(axes) + for ax in axes: + if xlabelsize or xrot: + if isinstance(ax.xaxis.get_minor_formatter(), NullFormatter): + # If minor ticks has NullFormatter, rot / fontsize are not + # retained + labels = ax.get_xticklabels() + else: + labels = ax.get_xticklabels() + ax.get_xticklabels( + minor=True) + + for label in labels: + if xlabelsize is not None: + self.assertAlmostEqual(label.get_fontsize(), + xlabelsize) + if xrot is not None: + self.assertAlmostEqual(label.get_rotation(), xrot) + + if ylabelsize or yrot: + if isinstance(ax.yaxis.get_minor_formatter(), NullFormatter): + labels = ax.get_yticklabels() + else: + labels = ax.get_yticklabels() + ax.get_yticklabels( + minor=True) + + for label in labels: + if ylabelsize is not None: + self.assertAlmostEqual(label.get_fontsize(), + ylabelsize) + if yrot is not None: + self.assertAlmostEqual(label.get_rotation(), yrot) + + def _check_ax_scales(self, axes, xaxis='linear', yaxis='linear'): + """ + Check each axes has expected scales + + Parameters + ---------- + axes : matplotlib Axes object, or its list-like + xaxis : {'linear', 'log'} + expected xaxis scale + yaxis : {'linear', 'log'} + expected yaxis scale + """ + axes = self._flatten_visible(axes) + for ax in axes: + self.assertEqual(ax.xaxis.get_scale(), xaxis) + self.assertEqual(ax.yaxis.get_scale(), yaxis) + + def _check_axes_shape(self, axes, axes_num=None, layout=None, + figsize=None): + """ + Check expected number of axes is drawn in expected layout + + Parameters + ---------- + axes : matplotlib Axes object, or its list-like + axes_num : number + expected number of axes. Unnecessary axes should be set to + invisible. + layout : tuple + expected layout, (expected number of rows , columns) + figsize : tuple + expected figsize. 
default is matplotlib default + """ + if figsize is None: + figsize = self.default_figsize + visible_axes = self._flatten_visible(axes) + + if axes_num is not None: + self.assertEqual(len(visible_axes), axes_num) + for ax in visible_axes: + # check something drawn on visible axes + self.assertTrue(len(ax.get_children()) > 0) + + if layout is not None: + result = self._get_axes_layout(plotting._flatten(axes)) + self.assertEqual(result, layout) + + self.assert_numpy_array_equal( + visible_axes[0].figure.get_size_inches(), + np.array(figsize, dtype=np.float64)) + + def _get_axes_layout(self, axes): + x_set = set() + y_set = set() + for ax in axes: + # check axes coordinates to estimate layout + points = ax.get_position().get_points() + x_set.add(points[0][0]) + y_set.add(points[0][1]) + return (len(y_set), len(x_set)) + + def _flatten_visible(self, axes): + """ + Flatten axes, and filter only visible + + Parameters + ---------- + axes : matplotlib Axes object, or its list-like + + """ + axes = plotting._flatten(axes) + axes = [ax for ax in axes if ax.get_visible()] + return axes + + def _check_has_errorbars(self, axes, xerr=0, yerr=0): + """ + Check axes has expected number of errorbars + + Parameters + ---------- + axes : matplotlib Axes object, or its list-like + xerr : number + expected number of x errorbar + yerr : number + expected number of y errorbar + """ + axes = self._flatten_visible(axes) + for ax in axes: + containers = ax.containers + xerr_count = 0 + yerr_count = 0 + for c in containers: + has_xerr = getattr(c, 'has_xerr', False) + has_yerr = getattr(c, 'has_yerr', False) + if has_xerr: + xerr_count += 1 + if has_yerr: + yerr_count += 1 + self.assertEqual(xerr, xerr_count) + self.assertEqual(yerr, yerr_count) + + def _check_box_return_type(self, returned, return_type, expected_keys=None, + check_ax_title=True): + """ + Check box returned type is correct + + Parameters + ---------- + returned : object to be tested, returned from boxplot + return_type : str + return_type passed to boxplot + expected_keys : list-like, optional + group labels in subplot case. If not passed, + the function checks assuming boxplot uses single ax + check_ax_title : bool + Whether to check the ax.title is the same as expected_key + Intended to be checked by calling from ``boxplot``. + Normal ``plot`` doesn't attach ``ax.title``, it must be disabled. 
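+
+ e.g. ``self._check_box_return_type(df.boxplot(return_type='dict'), 'dict')``
+ exercises the single-axes ``dict`` case (illustrative usage only).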
+ """ + from matplotlib.axes import Axes + types = {'dict': dict, 'axes': Axes, 'both': tuple} + if expected_keys is None: + # should be fixed when the returning default is changed + if return_type is None: + return_type = 'dict' + + self.assertTrue(isinstance(returned, types[return_type])) + if return_type == 'both': + self.assertIsInstance(returned.ax, Axes) + self.assertIsInstance(returned.lines, dict) + else: + # should be fixed when the returning default is changed + if return_type is None: + for r in self._flatten_visible(returned): + self.assertIsInstance(r, Axes) + return + + self.assertTrue(isinstance(returned, Series)) + + self.assertEqual(sorted(returned.keys()), sorted(expected_keys)) + for key, value in iteritems(returned): + self.assertTrue(isinstance(value, types[return_type])) + # check returned dict has correct mapping + if return_type == 'axes': + if check_ax_title: + self.assertEqual(value.get_title(), key) + elif return_type == 'both': + if check_ax_title: + self.assertEqual(value.ax.get_title(), key) + self.assertIsInstance(value.ax, Axes) + self.assertIsInstance(value.lines, dict) + elif return_type == 'dict': + line = value['medians'][0] + axes = line.axes if self.mpl_ge_1_5_0 else line.get_axes() + if check_ax_title: + self.assertEqual(axes.get_title(), key) + else: + raise AssertionError + + def _check_grid_settings(self, obj, kinds, kws={}): + # Make sure plot defaults to rcParams['axes.grid'] setting, GH 9792 + + import matplotlib as mpl + + def is_grid_on(): + xoff = all(not g.gridOn + for g in self.plt.gca().xaxis.get_major_ticks()) + yoff = all(not g.gridOn + for g in self.plt.gca().yaxis.get_major_ticks()) + return not (xoff and yoff) + + spndx = 1 + for kind in kinds: + if not _ok_for_gaussian_kde(kind): + continue + + self.plt.subplot(1, 4 * len(kinds), spndx) + spndx += 1 + mpl.rc('axes', grid=False) + obj.plot(kind=kind, **kws) + self.assertFalse(is_grid_on()) + + self.plt.subplot(1, 4 * len(kinds), spndx) + spndx += 1 + mpl.rc('axes', grid=True) + obj.plot(kind=kind, grid=False, **kws) + self.assertFalse(is_grid_on()) + + if kind != 'pie': + self.plt.subplot(1, 4 * len(kinds), spndx) + spndx += 1 + mpl.rc('axes', grid=True) + obj.plot(kind=kind, **kws) + self.assertTrue(is_grid_on()) + + self.plt.subplot(1, 4 * len(kinds), spndx) + spndx += 1 + mpl.rc('axes', grid=False) + obj.plot(kind=kind, grid=True, **kws) + self.assertTrue(is_grid_on()) + + def _maybe_unpack_cycler(self, rcParams, field='color'): + """ + Compat layer for MPL 1.5 change to color cycle + + Before: plt.rcParams['axes.color_cycle'] -> ['b', 'g', 'r'...] + After : plt.rcParams['axes.prop_cycle'] -> cycler(...) 
+ """ + if self.mpl_ge_1_5_0: + cyl = rcParams['axes.prop_cycle'] + colors = [v[field] for v in cyl] + else: + colors = rcParams['axes.color_cycle'] + return colors + + +def _check_plot_works(f, filterwarnings='always', **kwargs): + import matplotlib.pyplot as plt + ret = None + with warnings.catch_warnings(): + warnings.simplefilter(filterwarnings) + try: + try: + fig = kwargs['figure'] + except KeyError: + fig = plt.gcf() + + plt.clf() + + ax = kwargs.get('ax', fig.add_subplot(211)) # noqa + ret = f(**kwargs) + + assert_is_valid_plot_return_object(ret) + + try: + kwargs['ax'] = fig.add_subplot(212) + ret = f(**kwargs) + except Exception: + pass + else: + assert_is_valid_plot_return_object(ret) + + with ensure_clean(return_filelike=True) as path: + plt.savefig(path) + finally: + tm.close(fig) + + return ret + + +def curpath(): + pth, _ = os.path.split(os.path.abspath(__file__)) + return pth diff --git a/pandas/tests/plotting/test_boxplot_method.py b/pandas/tests/plotting/test_boxplot_method.py new file mode 100644 index 0000000000000..333792c5ffdb2 --- /dev/null +++ b/pandas/tests/plotting/test_boxplot_method.py @@ -0,0 +1,376 @@ +#!/usr/bin/env python +# coding: utf-8 + +import nose +import itertools +import string +from distutils.version import LooseVersion + +from pandas import Series, DataFrame, MultiIndex +from pandas.compat import range, lzip +import pandas.util.testing as tm +from pandas.util.testing import slow + +import numpy as np +from numpy import random +from numpy.random import randn + +import pandas.tools.plotting as plotting + +from pandas.tests.plotting.common import (TestPlotBase, _check_plot_works) + + +""" Test cases for .boxplot method """ + + +def _skip_if_mpl_14_or_dev_boxplot(): + # GH 8382 + # Boxplot failures on 1.4 and 1.4.1 + # Don't need try / except since that's done at class level + import matplotlib + if str(matplotlib.__version__) >= LooseVersion('1.4'): + raise nose.SkipTest("Matplotlib Regression in 1.4 and current dev.") + + +@tm.mplskip +class TestDataFramePlots(TestPlotBase): + + @slow + def test_boxplot_legacy(self): + df = DataFrame(randn(6, 4), + index=list(string.ascii_letters[:6]), + columns=['one', 'two', 'three', 'four']) + df['indic'] = ['foo', 'bar'] * 3 + df['indic2'] = ['foo', 'bar', 'foo'] * 2 + + _check_plot_works(df.boxplot, return_type='dict') + _check_plot_works(df.boxplot, column=[ + 'one', 'two'], return_type='dict') + # _check_plot_works adds an ax so catch warning. 
see GH #13188 + with tm.assert_produces_warning(UserWarning): + _check_plot_works(df.boxplot, column=['one', 'two'], + by='indic') + _check_plot_works(df.boxplot, column='one', by=['indic', 'indic2']) + with tm.assert_produces_warning(UserWarning): + _check_plot_works(df.boxplot, by='indic') + with tm.assert_produces_warning(UserWarning): + _check_plot_works(df.boxplot, by=['indic', 'indic2']) + _check_plot_works(plotting.boxplot, data=df['one'], return_type='dict') + _check_plot_works(df.boxplot, notch=1, return_type='dict') + with tm.assert_produces_warning(UserWarning): + _check_plot_works(df.boxplot, by='indic', notch=1) + + df = DataFrame(np.random.rand(10, 2), columns=['Col1', 'Col2']) + df['X'] = Series(['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B']) + df['Y'] = Series(['A'] * 10) + with tm.assert_produces_warning(UserWarning): + _check_plot_works(df.boxplot, by='X') + + # When ax is supplied and required number of axes is 1, + # passed ax should be used: + fig, ax = self.plt.subplots() + axes = df.boxplot('Col1', by='X', ax=ax) + ax_axes = ax.axes if self.mpl_ge_1_5_0 else ax.get_axes() + self.assertIs(ax_axes, axes) + + fig, ax = self.plt.subplots() + axes = df.groupby('Y').boxplot(ax=ax, return_type='axes') + ax_axes = ax.axes if self.mpl_ge_1_5_0 else ax.get_axes() + self.assertIs(ax_axes, axes['A']) + + # Multiple columns with an ax argument should use same figure + fig, ax = self.plt.subplots() + with tm.assert_produces_warning(UserWarning): + axes = df.boxplot(column=['Col1', 'Col2'], + by='X', ax=ax, return_type='axes') + self.assertIs(axes['Col1'].get_figure(), fig) + + # When by is None, check that all relevant lines are present in the + # dict + fig, ax = self.plt.subplots() + d = df.boxplot(ax=ax, return_type='dict') + lines = list(itertools.chain.from_iterable(d.values())) + self.assertEqual(len(ax.get_lines()), len(lines)) + + @slow + def test_boxplot_return_type_none(self): + # GH 12216; return_type=None & by=None -> axes + result = self.hist_df.boxplot() + self.assertTrue(isinstance(result, self.plt.Axes)) + + @slow + def test_boxplot_return_type_legacy(self): + # API change in https://github.com/pydata/pandas/pull/7096 + import matplotlib as mpl # noqa + + df = DataFrame(randn(6, 4), + index=list(string.ascii_letters[:6]), + columns=['one', 'two', 'three', 'four']) + with tm.assertRaises(ValueError): + df.boxplot(return_type='NOTATYPE') + + result = df.boxplot() + self._check_box_return_type(result, 'axes') + + with tm.assert_produces_warning(False): + result = df.boxplot(return_type='dict') + self._check_box_return_type(result, 'dict') + + with tm.assert_produces_warning(False): + result = df.boxplot(return_type='axes') + self._check_box_return_type(result, 'axes') + + with tm.assert_produces_warning(False): + result = df.boxplot(return_type='both') + self._check_box_return_type(result, 'both') + + @slow + def test_boxplot_axis_limits(self): + + def _check_ax_limits(col, ax): + y_min, y_max = ax.get_ylim() + self.assertTrue(y_min <= col.min()) + self.assertTrue(y_max >= col.max()) + + df = self.hist_df.copy() + df['age'] = np.random.randint(1, 20, df.shape[0]) + # One full row + height_ax, weight_ax = df.boxplot(['height', 'weight'], by='category') + _check_ax_limits(df['height'], height_ax) + _check_ax_limits(df['weight'], weight_ax) + self.assertEqual(weight_ax._sharey, height_ax) + + # Two rows, one partial + p = df.boxplot(['height', 'weight', 'age'], by='category') + height_ax, weight_ax, age_ax = p[0, 0], p[0, 1], p[1, 0] + dummy_ax = p[1, 1] + 
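+ # each populated axes should span its column's data range and share its
+ # y-axis with height_ax; the unused fourth axes must stay unshared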
+ _check_ax_limits(df['height'], height_ax) + _check_ax_limits(df['weight'], weight_ax) + _check_ax_limits(df['age'], age_ax) + self.assertEqual(weight_ax._sharey, height_ax) + self.assertEqual(age_ax._sharey, height_ax) + self.assertIsNone(dummy_ax._sharey) + + @slow + def test_boxplot_empty_column(self): + _skip_if_mpl_14_or_dev_boxplot() + df = DataFrame(np.random.randn(20, 4)) + df.loc[:, 0] = np.nan + _check_plot_works(df.boxplot, return_type='axes') + + +@tm.mplskip +class TestDataFrameGroupByPlots(TestPlotBase): + + @slow + def test_boxplot_legacy(self): + grouped = self.hist_df.groupby(by='gender') + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(grouped.boxplot, return_type='axes') + self._check_axes_shape(list(axes.values), axes_num=2, layout=(1, 2)) + axes = _check_plot_works(grouped.boxplot, subplots=False, + return_type='axes') + self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) + tuples = lzip(string.ascii_letters[:10], range(10)) + df = DataFrame(np.random.rand(10, 3), + index=MultiIndex.from_tuples(tuples)) + + grouped = df.groupby(level=1) + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(grouped.boxplot, return_type='axes') + self._check_axes_shape(list(axes.values), axes_num=10, layout=(4, 3)) + + axes = _check_plot_works(grouped.boxplot, subplots=False, + return_type='axes') + self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) + + grouped = df.unstack(level=1).groupby(level=0, axis=1) + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(grouped.boxplot, return_type='axes') + self._check_axes_shape(list(axes.values), axes_num=3, layout=(2, 2)) + axes = _check_plot_works(grouped.boxplot, subplots=False, + return_type='axes') + self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) + + @slow + def test_grouped_plot_fignums(self): + n = 10 + weight = Series(np.random.normal(166, 20, size=n)) + height = Series(np.random.normal(60, 10, size=n)) + with tm.RNGContext(42): + gender = np.random.choice(['male', 'female'], size=n) + df = DataFrame({'height': height, 'weight': weight, 'gender': gender}) + gb = df.groupby('gender') + + res = gb.plot() + self.assertEqual(len(self.plt.get_fignums()), 2) + self.assertEqual(len(res), 2) + tm.close() + + res = gb.boxplot(return_type='axes') + self.assertEqual(len(self.plt.get_fignums()), 1) + self.assertEqual(len(res), 2) + tm.close() + + # now works with GH 5610 as gender is excluded + res = df.groupby('gender').hist() + tm.close() + + @slow + def test_grouped_box_return_type(self): + df = self.hist_df + + # old style: return_type=None + result = df.boxplot(by='gender') + self.assertIsInstance(result, np.ndarray) + self._check_box_return_type( + result, None, + expected_keys=['height', 'weight', 'category']) + + # now for groupby + result = df.groupby('gender').boxplot(return_type='dict') + self._check_box_return_type( + result, 'dict', expected_keys=['Male', 'Female']) + + columns2 = 'X B C D A G Y N Q O'.split() + df2 = DataFrame(random.randn(50, 10), columns=columns2) + categories2 = 'A B C D E F G H I J'.split() + df2['category'] = categories2 * 5 + + for t in ['dict', 'axes', 'both']: + returned = df.groupby('classroom').boxplot(return_type=t) + self._check_box_return_type( + returned, t, expected_keys=['A', 'B', 'C']) + + returned = df.boxplot(by='classroom', return_type=t) + self._check_box_return_type( + returned, t, + expected_keys=['height', 'weight', 'category']) + + returned = df2.groupby('category').boxplot(return_type=t) + 
self._check_box_return_type(returned, t, expected_keys=categories2) + + returned = df2.boxplot(by='category', return_type=t) + self._check_box_return_type(returned, t, expected_keys=columns2) + + @slow + def test_grouped_box_layout(self): + df = self.hist_df + + self.assertRaises(ValueError, df.boxplot, column=['weight', 'height'], + by=df.gender, layout=(1, 1)) + self.assertRaises(ValueError, df.boxplot, + column=['height', 'weight', 'category'], + layout=(2, 1), return_type='dict') + self.assertRaises(ValueError, df.boxplot, column=['weight', 'height'], + by=df.gender, layout=(-1, -1)) + + # _check_plot_works adds an ax so catch warning. see GH #13188 + with tm.assert_produces_warning(UserWarning): + box = _check_plot_works(df.groupby('gender').boxplot, + column='height', return_type='dict') + self._check_axes_shape(self.plt.gcf().axes, axes_num=2, layout=(1, 2)) + + with tm.assert_produces_warning(UserWarning): + box = _check_plot_works(df.groupby('category').boxplot, + column='height', + return_type='dict') + self._check_axes_shape(self.plt.gcf().axes, axes_num=4, layout=(2, 2)) + + # GH 6769 + with tm.assert_produces_warning(UserWarning): + box = _check_plot_works(df.groupby('classroom').boxplot, + column='height', return_type='dict') + self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(2, 2)) + + # GH 5897 + axes = df.boxplot(column=['height', 'weight', 'category'], by='gender', + return_type='axes') + self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(2, 2)) + for ax in [axes['height']]: + self._check_visible(ax.get_xticklabels(), visible=False) + self._check_visible([ax.xaxis.get_label()], visible=False) + for ax in [axes['weight'], axes['category']]: + self._check_visible(ax.get_xticklabels()) + self._check_visible([ax.xaxis.get_label()]) + + box = df.groupby('classroom').boxplot( + column=['height', 'weight', 'category'], return_type='dict') + self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(2, 2)) + + with tm.assert_produces_warning(UserWarning): + box = _check_plot_works(df.groupby('category').boxplot, + column='height', + layout=(3, 2), return_type='dict') + self._check_axes_shape(self.plt.gcf().axes, axes_num=4, layout=(3, 2)) + with tm.assert_produces_warning(UserWarning): + box = _check_plot_works(df.groupby('category').boxplot, + column='height', + layout=(3, -1), return_type='dict') + self._check_axes_shape(self.plt.gcf().axes, axes_num=4, layout=(3, 2)) + + box = df.boxplot(column=['height', 'weight', 'category'], by='gender', + layout=(4, 1)) + self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(4, 1)) + + box = df.boxplot(column=['height', 'weight', 'category'], by='gender', + layout=(-1, 1)) + self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(3, 1)) + + box = df.groupby('classroom').boxplot( + column=['height', 'weight', 'category'], layout=(1, 4), + return_type='dict') + self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(1, 4)) + + box = df.groupby('classroom').boxplot( # noqa + column=['height', 'weight', 'category'], layout=(1, -1), + return_type='dict') + self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(1, 3)) + + @slow + def test_grouped_box_multiple_axes(self): + # GH 6970, GH 7069 + df = self.hist_df + + # check warning to ignore sharex / sharey + # this check should be done in the first function which + # passes multiple axes to plot, hist or boxplot + # location should be changed if other test is added + # which has earlier alphabetical order + with 
tm.assert_produces_warning(UserWarning): + fig, axes = self.plt.subplots(2, 2) + df.groupby('category').boxplot( + column='height', return_type='axes', ax=axes) + self._check_axes_shape(self.plt.gcf().axes, + axes_num=4, layout=(2, 2)) + + fig, axes = self.plt.subplots(2, 3) + with tm.assert_produces_warning(UserWarning): + returned = df.boxplot(column=['height', 'weight', 'category'], + by='gender', return_type='axes', ax=axes[0]) + returned = np.array(list(returned.values)) + self._check_axes_shape(returned, axes_num=3, layout=(1, 3)) + self.assert_numpy_array_equal(returned, axes[0]) + self.assertIs(returned[0].figure, fig) + + # draw on second row + with tm.assert_produces_warning(UserWarning): + returned = df.groupby('classroom').boxplot( + column=['height', 'weight', 'category'], + return_type='axes', ax=axes[1]) + returned = np.array(list(returned.values)) + self._check_axes_shape(returned, axes_num=3, layout=(1, 3)) + self.assert_numpy_array_equal(returned, axes[1]) + self.assertIs(returned[0].figure, fig) + + with tm.assertRaises(ValueError): + fig, axes = self.plt.subplots(2, 3) + # pass different number of axes from required + with tm.assert_produces_warning(UserWarning): + axes = df.groupby('classroom').boxplot(ax=axes) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tseries/tests/test_plotting.py b/pandas/tests/plotting/test_datetimelike.py similarity index 97% rename from pandas/tseries/tests/test_plotting.py rename to pandas/tests/plotting/test_datetimelike.py index 2255f9fae73de..0f7bc02e24915 100644 --- a/pandas/tseries/tests/test_plotting.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -14,12 +14,19 @@ from pandas.util.testing import assert_series_equal, ensure_clean, slow import pandas.util.testing as tm -from pandas.tests.test_graphics import _skip_if_no_scipy_gaussian_kde +from pandas.tests.plotting.common import (TestPlotBase, + _skip_if_no_scipy_gaussian_kde) + + +""" Test cases for time series specific (freq conversion, etc) """ @tm.mplskip -class TestTSPlot(tm.TestCase): +class TestTSPlot(TestPlotBase): + def setUp(self): + TestPlotBase.setUp(self) + freq = ['S', 'T', 'H', 'D', 'W', 'M', 'Q', 'A'] idx = [period_range('12/31/1999', freq=x, periods=100) for x in freq] self.period_ser = [Series(np.random.randn(len(x)), x) for x in idx] @@ -116,7 +123,8 @@ def test_tsplot(self): _check_plot_works(s.plot, ax=ax) ax = ts.plot(style='k') - self.assertEqual((0., 0., 0.), ax.get_lines()[0].get_color()) + color = (0., 0., 0., 1) if self.mpl_ge_2_0_0 else (0., 0., 0.) 
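+ # style 'k' is black; matplotlib >= 2.0 reports line colors as RGBA 4-tuples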
+ self.assertEqual(color, ax.get_lines()[0].get_color()) def test_both_style_and_color(self): import matplotlib.pyplot as plt # noqa @@ -569,7 +577,8 @@ def test_secondary_y(self): plt.close(fig) ax2 = ser2.plot() - self.assertEqual(ax2.get_yaxis().get_ticks_position(), 'default') + self.assertEqual(ax2.get_yaxis().get_ticks_position(), + self.default_tick_position) plt.close(ax2.get_figure()) ax = ser2.plot() @@ -599,7 +608,8 @@ def test_secondary_y_ts(self): plt.close(fig) ax2 = ser2.plot() - self.assertEqual(ax2.get_yaxis().get_ticks_position(), 'default') + self.assertEqual(ax2.get_yaxis().get_ticks_position(), + self.default_tick_position) plt.close(ax2.get_figure()) ax = ser2.plot() @@ -633,7 +643,8 @@ def test_secondary_frame(self): df = DataFrame(np.random.randn(5, 3), columns=['a', 'b', 'c']) axes = df.plot(secondary_y=['a', 'c'], subplots=True) self.assertEqual(axes[0].get_yaxis().get_ticks_position(), 'right') - self.assertEqual(axes[1].get_yaxis().get_ticks_position(), 'default') + self.assertEqual(axes[1].get_yaxis().get_ticks_position(), + self.default_tick_position) self.assertEqual(axes[2].get_yaxis().get_ticks_position(), 'right') @slow @@ -641,7 +652,8 @@ def test_secondary_bar_frame(self): df = DataFrame(np.random.randn(5, 3), columns=['a', 'b', 'c']) axes = df.plot(kind='bar', secondary_y=['a', 'c'], subplots=True) self.assertEqual(axes[0].get_yaxis().get_ticks_position(), 'right') - self.assertEqual(axes[1].get_yaxis().get_ticks_position(), 'default') + self.assertEqual(axes[1].get_yaxis().get_ticks_position(), + self.default_tick_position) self.assertEqual(axes[2].get_yaxis().get_ticks_position(), 'right') def test_mixed_freq_regular_first(self): @@ -1216,6 +1228,14 @@ def test_secondary_y_irregular_ts_xlim(self): self.assertEqual(left, ts_irregular.index.min().toordinal()) self.assertEqual(right, ts_irregular.index.max().toordinal()) + def test_plot_outofbounds_datetime(self): + # 2579 - checking this does not raise + values = [date(1677, 1, 1), date(1677, 1, 2)] + self.plt.plot(values) + + values = [datetime(1677, 1, 1, 12), datetime(1677, 1, 2, 12)] + self.plt.plot(values) + def _check_plot_works(f, freq=None, series=None, *args, **kwargs): import matplotlib.pyplot as plt diff --git a/pandas/tests/test_graphics.py b/pandas/tests/plotting/test_frame.py similarity index 66% rename from pandas/tests/test_graphics.py rename to pandas/tests/plotting/test_frame.py index b09185c19bffb..4d0c1e9213b17 100644 --- a/pandas/tests/test_graphics.py +++ b/pandas/tests/plotting/test_frame.py @@ -2,8 +2,6 @@ # coding: utf-8 import nose -import itertools -import os import string import warnings @@ -12,1271 +10,24 @@ import pandas as pd from pandas import (Series, DataFrame, MultiIndex, PeriodIndex, date_range, bdate_range) -from pandas.compat import (range, lrange, StringIO, lmap, lzip, u, zip, - iteritems, OrderedDict, PY3) -from pandas.util.decorators import cache_readonly +from pandas.types.api import is_list_like +from pandas.compat import (range, lrange, StringIO, lmap, lzip, u, zip, PY3) from pandas.formats.printing import pprint_thing -import pandas.core.common as com import pandas.util.testing as tm -from pandas.util.testing import (ensure_clean, - assert_is_valid_plot_return_object, slow) +from pandas.util.testing import slow from pandas.core.config import set_option import numpy as np -from numpy import random from numpy.random import rand, randn import pandas.tools.plotting as plotting -""" -These tests are for ``Dataframe.plot`` and ``Series.plot``. 
-Other plot methods such as ``.hist``, ``.boxplot`` and other miscellaneous -are tested in test_graphics_others.py -""" +from pandas.tests.plotting.common import (TestPlotBase, _check_plot_works, + _skip_if_no_scipy_gaussian_kde, + _ok_for_gaussian_kde) -def _skip_if_no_scipy_gaussian_kde(): - try: - from scipy.stats import gaussian_kde # noqa - except ImportError: - raise nose.SkipTest("scipy version doesn't support gaussian_kde") - - -def _ok_for_gaussian_kde(kind): - if kind in ['kde', 'density']: - try: - from scipy.stats import gaussian_kde # noqa - except ImportError: - return False - return True - - -@tm.mplskip -class TestPlotBase(tm.TestCase): - - def setUp(self): - - import matplotlib as mpl - mpl.rcdefaults() - - n = 100 - with tm.RNGContext(42): - gender = np.random.choice(['Male', 'Female'], size=n) - classroom = np.random.choice(['A', 'B', 'C'], size=n) - - self.hist_df = DataFrame({'gender': gender, - 'classroom': classroom, - 'height': random.normal(66, 4, size=n), - 'weight': random.normal(161, 32, size=n), - 'category': random.randint(4, size=n)}) - - self.mpl_le_1_2_1 = plotting._mpl_le_1_2_1() - self.mpl_ge_1_3_1 = plotting._mpl_ge_1_3_1() - self.mpl_ge_1_4_0 = plotting._mpl_ge_1_4_0() - self.mpl_ge_1_5_0 = plotting._mpl_ge_1_5_0() - - if self.mpl_ge_1_4_0: - self.bp_n_objects = 7 - else: - self.bp_n_objects = 8 - if self.mpl_ge_1_5_0: - # 1.5 added PolyCollections to legend handler - # so we have twice as many items. - self.polycollection_factor = 2 - else: - self.polycollection_factor = 1 - - def tearDown(self): - tm.close() - - @cache_readonly - def plt(self): - import matplotlib.pyplot as plt - return plt - - @cache_readonly - def colorconverter(self): - import matplotlib.colors as colors - return colors.colorConverter - - def _check_legend_labels(self, axes, labels=None, visible=True): - """ - Check each axes has expected legend labels - - Parameters - ---------- - axes : matplotlib Axes object, or its list-like - labels : list-like - expected legend labels - visible : bool - expected legend visibility. 
labels are checked only when visible is - True - """ - - if visible and (labels is None): - raise ValueError('labels must be specified when visible is True') - axes = self._flatten_visible(axes) - for ax in axes: - if visible: - self.assertTrue(ax.get_legend() is not None) - self._check_text_labels(ax.get_legend().get_texts(), labels) - else: - self.assertTrue(ax.get_legend() is None) - - def _check_data(self, xp, rs): - """ - Check each axes has identical lines - - Parameters - ---------- - xp : matplotlib Axes object - rs : matplotlib Axes object - """ - xp_lines = xp.get_lines() - rs_lines = rs.get_lines() - - def check_line(xpl, rsl): - xpdata = xpl.get_xydata() - rsdata = rsl.get_xydata() - tm.assert_almost_equal(xpdata, rsdata) - - self.assertEqual(len(xp_lines), len(rs_lines)) - [check_line(xpl, rsl) for xpl, rsl in zip(xp_lines, rs_lines)] - tm.close() - - def _check_visible(self, collections, visible=True): - """ - Check each artist is visible or not - - Parameters - ---------- - collections : matplotlib Artist or its list-like - target Artist or its list or collection - visible : bool - expected visibility - """ - from matplotlib.collections import Collection - if not isinstance(collections, - Collection) and not com.is_list_like(collections): - collections = [collections] - - for patch in collections: - self.assertEqual(patch.get_visible(), visible) - - def _get_colors_mapped(self, series, colors): - unique = series.unique() - # unique and colors length can be differed - # depending on slice value - mapped = dict(zip(unique, colors)) - return [mapped[v] for v in series.values] - - def _check_colors(self, collections, linecolors=None, facecolors=None, - mapping=None): - """ - Check each artist has expected line colors and face colors - - Parameters - ---------- - collections : list-like - list or collection of target artist - linecolors : list-like which has the same length as collections - list of expected line colors - facecolors : list-like which has the same length as collections - list of expected face colors - mapping : Series - Series used for color grouping key - used for andrew_curves, parallel_coordinates, radviz test - """ - - from matplotlib.lines import Line2D - from matplotlib.collections import Collection, PolyCollection - conv = self.colorconverter - if linecolors is not None: - - if mapping is not None: - linecolors = self._get_colors_mapped(mapping, linecolors) - linecolors = linecolors[:len(collections)] - - self.assertEqual(len(collections), len(linecolors)) - for patch, color in zip(collections, linecolors): - if isinstance(patch, Line2D): - result = patch.get_color() - # Line2D may contains string color expression - result = conv.to_rgba(result) - elif isinstance(patch, PolyCollection): - result = tuple(patch.get_edgecolor()[0]) - else: - result = patch.get_edgecolor() - - expected = conv.to_rgba(color) - self.assertEqual(result, expected) - - if facecolors is not None: - - if mapping is not None: - facecolors = self._get_colors_mapped(mapping, facecolors) - facecolors = facecolors[:len(collections)] - - self.assertEqual(len(collections), len(facecolors)) - for patch, color in zip(collections, facecolors): - if isinstance(patch, Collection): - # returned as list of np.array - result = patch.get_facecolor()[0] - else: - result = patch.get_facecolor() - - if isinstance(result, np.ndarray): - result = tuple(result) - - expected = conv.to_rgba(color) - self.assertEqual(result, expected) - - def _check_text_labels(self, texts, expected): - """ - Check each text 
has expected labels - - Parameters - ---------- - texts : matplotlib Text object, or its list-like - target text, or its list - expected : str or list-like which has the same length as texts - expected text label, or its list - """ - if not com.is_list_like(texts): - self.assertEqual(texts.get_text(), expected) - else: - labels = [t.get_text() for t in texts] - self.assertEqual(len(labels), len(expected)) - for l, e in zip(labels, expected): - self.assertEqual(l, e) - - def _check_ticks_props(self, axes, xlabelsize=None, xrot=None, - ylabelsize=None, yrot=None): - """ - Check each axes has expected tick properties - - Parameters - ---------- - axes : matplotlib Axes object, or its list-like - xlabelsize : number - expected xticks font size - xrot : number - expected xticks rotation - ylabelsize : number - expected yticks font size - yrot : number - expected yticks rotation - """ - from matplotlib.ticker import NullFormatter - axes = self._flatten_visible(axes) - for ax in axes: - if xlabelsize or xrot: - if isinstance(ax.xaxis.get_minor_formatter(), NullFormatter): - # If minor ticks has NullFormatter, rot / fontsize are not - # retained - labels = ax.get_xticklabels() - else: - labels = ax.get_xticklabels() + ax.get_xticklabels( - minor=True) - - for label in labels: - if xlabelsize is not None: - self.assertAlmostEqual(label.get_fontsize(), - xlabelsize) - if xrot is not None: - self.assertAlmostEqual(label.get_rotation(), xrot) - - if ylabelsize or yrot: - if isinstance(ax.yaxis.get_minor_formatter(), NullFormatter): - labels = ax.get_yticklabels() - else: - labels = ax.get_yticklabels() + ax.get_yticklabels( - minor=True) - - for label in labels: - if ylabelsize is not None: - self.assertAlmostEqual(label.get_fontsize(), - ylabelsize) - if yrot is not None: - self.assertAlmostEqual(label.get_rotation(), yrot) - - def _check_ax_scales(self, axes, xaxis='linear', yaxis='linear'): - """ - Check each axes has expected scales - - Parameters - ---------- - axes : matplotlib Axes object, or its list-like - xaxis : {'linear', 'log'} - expected xaxis scale - yaxis : {'linear', 'log'} - expected yaxis scale - """ - axes = self._flatten_visible(axes) - for ax in axes: - self.assertEqual(ax.xaxis.get_scale(), xaxis) - self.assertEqual(ax.yaxis.get_scale(), yaxis) - - def _check_axes_shape(self, axes, axes_num=None, layout=None, - figsize=(8.0, 6.0)): - """ - Check expected number of axes is drawn in expected layout - - Parameters - ---------- - axes : matplotlib Axes object, or its list-like - axes_num : number - expected number of axes. Unnecessary axes should be set to - invisible. - layout : tuple - expected layout, (expected number of rows , columns) - figsize : tuple - expected figsize. 
default is matplotlib default - """ - visible_axes = self._flatten_visible(axes) - - if axes_num is not None: - self.assertEqual(len(visible_axes), axes_num) - for ax in visible_axes: - # check something drawn on visible axes - self.assertTrue(len(ax.get_children()) > 0) - - if layout is not None: - result = self._get_axes_layout(plotting._flatten(axes)) - self.assertEqual(result, layout) - - self.assert_numpy_array_equal( - np.round(visible_axes[0].figure.get_size_inches()), - np.array(figsize, dtype=np.float64)) - - def _get_axes_layout(self, axes): - x_set = set() - y_set = set() - for ax in axes: - # check axes coordinates to estimate layout - points = ax.get_position().get_points() - x_set.add(points[0][0]) - y_set.add(points[0][1]) - return (len(y_set), len(x_set)) - - def _flatten_visible(self, axes): - """ - Flatten axes, and filter only visible - - Parameters - ---------- - axes : matplotlib Axes object, or its list-like - - """ - axes = plotting._flatten(axes) - axes = [ax for ax in axes if ax.get_visible()] - return axes - - def _check_has_errorbars(self, axes, xerr=0, yerr=0): - """ - Check axes has expected number of errorbars - - Parameters - ---------- - axes : matplotlib Axes object, or its list-like - xerr : number - expected number of x errorbar - yerr : number - expected number of y errorbar - """ - axes = self._flatten_visible(axes) - for ax in axes: - containers = ax.containers - xerr_count = 0 - yerr_count = 0 - for c in containers: - has_xerr = getattr(c, 'has_xerr', False) - has_yerr = getattr(c, 'has_yerr', False) - if has_xerr: - xerr_count += 1 - if has_yerr: - yerr_count += 1 - self.assertEqual(xerr, xerr_count) - self.assertEqual(yerr, yerr_count) - - def _check_box_return_type(self, returned, return_type, expected_keys=None, - check_ax_title=True): - """ - Check box returned type is correct - - Parameters - ---------- - returned : object to be tested, returned from boxplot - return_type : str - return_type passed to boxplot - expected_keys : list-like, optional - group labels in subplot case. If not passed, - the function checks assuming boxplot uses single ax - check_ax_title : bool - Whether to check the ax.title is the same as expected_key - Intended to be checked by calling from ``boxplot``. - Normal ``plot`` doesn't attach ``ax.title``, it must be disabled. 
- """ - from matplotlib.axes import Axes - types = {'dict': dict, 'axes': Axes, 'both': tuple} - if expected_keys is None: - # should be fixed when the returning default is changed - if return_type is None: - return_type = 'dict' - - self.assertTrue(isinstance(returned, types[return_type])) - if return_type == 'both': - self.assertIsInstance(returned.ax, Axes) - self.assertIsInstance(returned.lines, dict) - else: - # should be fixed when the returning default is changed - if return_type is None: - for r in self._flatten_visible(returned): - self.assertIsInstance(r, Axes) - return - - self.assertTrue(isinstance(returned, OrderedDict)) - self.assertEqual(sorted(returned.keys()), sorted(expected_keys)) - for key, value in iteritems(returned): - self.assertTrue(isinstance(value, types[return_type])) - # check returned dict has correct mapping - if return_type == 'axes': - if check_ax_title: - self.assertEqual(value.get_title(), key) - elif return_type == 'both': - if check_ax_title: - self.assertEqual(value.ax.get_title(), key) - self.assertIsInstance(value.ax, Axes) - self.assertIsInstance(value.lines, dict) - elif return_type == 'dict': - line = value['medians'][0] - if check_ax_title: - self.assertEqual(line.get_axes().get_title(), key) - else: - raise AssertionError - - def _check_grid_settings(self, obj, kinds, kws={}): - # Make sure plot defaults to rcParams['axes.grid'] setting, GH 9792 - - import matplotlib as mpl - - def is_grid_on(): - xoff = all(not g.gridOn - for g in self.plt.gca().xaxis.get_major_ticks()) - yoff = all(not g.gridOn - for g in self.plt.gca().yaxis.get_major_ticks()) - return not (xoff and yoff) - - spndx = 1 - for kind in kinds: - if not _ok_for_gaussian_kde(kind): - continue - - self.plt.subplot(1, 4 * len(kinds), spndx) - spndx += 1 - mpl.rc('axes', grid=False) - obj.plot(kind=kind, **kws) - self.assertFalse(is_grid_on()) - - self.plt.subplot(1, 4 * len(kinds), spndx) - spndx += 1 - mpl.rc('axes', grid=True) - obj.plot(kind=kind, grid=False, **kws) - self.assertFalse(is_grid_on()) - - if kind != 'pie': - self.plt.subplot(1, 4 * len(kinds), spndx) - spndx += 1 - mpl.rc('axes', grid=True) - obj.plot(kind=kind, **kws) - self.assertTrue(is_grid_on()) - - self.plt.subplot(1, 4 * len(kinds), spndx) - spndx += 1 - mpl.rc('axes', grid=False) - obj.plot(kind=kind, grid=True, **kws) - self.assertTrue(is_grid_on()) - - def _maybe_unpack_cycler(self, rcParams, field='color'): - """ - Compat layer for MPL 1.5 change to color cycle - - Before: plt.rcParams['axes.color_cycle'] -> ['b', 'g', 'r'...] - After : plt.rcParams['axes.prop_cycle'] -> cycler(...) 
- """ - if self.mpl_ge_1_5_0: - cyl = rcParams['axes.prop_cycle'] - colors = [v[field] for v in cyl] - else: - colors = rcParams['axes.color_cycle'] - return colors - - -@tm.mplskip -class TestSeriesPlots(TestPlotBase): - - def setUp(self): - TestPlotBase.setUp(self) - import matplotlib as mpl - mpl.rcdefaults() - - self.ts = tm.makeTimeSeries() - self.ts.name = 'ts' - - self.series = tm.makeStringSeries() - self.series.name = 'series' - - self.iseries = tm.makePeriodSeries() - self.iseries.name = 'iseries' - - @slow - def test_plot(self): - _check_plot_works(self.ts.plot, label='foo') - _check_plot_works(self.ts.plot, use_index=False) - axes = _check_plot_works(self.ts.plot, rot=0) - self._check_ticks_props(axes, xrot=0) - - ax = _check_plot_works(self.ts.plot, style='.', logy=True) - self._check_ax_scales(ax, yaxis='log') - - ax = _check_plot_works(self.ts.plot, style='.', logx=True) - self._check_ax_scales(ax, xaxis='log') - - ax = _check_plot_works(self.ts.plot, style='.', loglog=True) - self._check_ax_scales(ax, xaxis='log', yaxis='log') - - _check_plot_works(self.ts[:10].plot.bar) - _check_plot_works(self.ts.plot.area, stacked=False) - _check_plot_works(self.iseries.plot) - - for kind in ['line', 'bar', 'barh', 'kde', 'hist', 'box']: - if not _ok_for_gaussian_kde(kind): - continue - _check_plot_works(self.series[:5].plot, kind=kind) - - _check_plot_works(self.series[:10].plot.barh) - ax = _check_plot_works(Series(randn(10)).plot.bar, color='black') - self._check_colors([ax.patches[0]], facecolors=['black']) - - # GH 6951 - ax = _check_plot_works(self.ts.plot, subplots=True) - self._check_axes_shape(ax, axes_num=1, layout=(1, 1)) - - ax = _check_plot_works(self.ts.plot, subplots=True, layout=(-1, 1)) - self._check_axes_shape(ax, axes_num=1, layout=(1, 1)) - ax = _check_plot_works(self.ts.plot, subplots=True, layout=(1, -1)) - self._check_axes_shape(ax, axes_num=1, layout=(1, 1)) - - @slow - def test_plot_figsize_and_title(self): - # figsize and title - ax = self.series.plot(title='Test', figsize=(16, 8)) - self._check_text_labels(ax.title, 'Test') - self._check_axes_shape(ax, axes_num=1, layout=(1, 1), figsize=(16, 8)) - - def test_dont_modify_rcParams(self): - # GH 8242 - if self.mpl_ge_1_5_0: - key = 'axes.prop_cycle' - else: - key = 'axes.color_cycle' - colors = self.plt.rcParams[key] - Series([1, 2, 3]).plot() - self.assertEqual(colors, self.plt.rcParams[key]) - - def test_ts_line_lim(self): - ax = self.ts.plot() - xmin, xmax = ax.get_xlim() - lines = ax.get_lines() - self.assertEqual(xmin, lines[0].get_data(orig=False)[0][0]) - self.assertEqual(xmax, lines[0].get_data(orig=False)[0][-1]) - tm.close() - - ax = self.ts.plot(secondary_y=True) - xmin, xmax = ax.get_xlim() - lines = ax.get_lines() - self.assertEqual(xmin, lines[0].get_data(orig=False)[0][0]) - self.assertEqual(xmax, lines[0].get_data(orig=False)[0][-1]) - - def test_ts_area_lim(self): - ax = self.ts.plot.area(stacked=False) - xmin, xmax = ax.get_xlim() - line = ax.get_lines()[0].get_data(orig=False)[0] - self.assertEqual(xmin, line[0]) - self.assertEqual(xmax, line[-1]) - tm.close() - - # GH 7471 - ax = self.ts.plot.area(stacked=False, x_compat=True) - xmin, xmax = ax.get_xlim() - line = ax.get_lines()[0].get_data(orig=False)[0] - self.assertEqual(xmin, line[0]) - self.assertEqual(xmax, line[-1]) - tm.close() - - tz_ts = self.ts.copy() - tz_ts.index = tz_ts.tz_localize('GMT').tz_convert('CET') - ax = tz_ts.plot.area(stacked=False, x_compat=True) - xmin, xmax = ax.get_xlim() - line = 
ax.get_lines()[0].get_data(orig=False)[0] - self.assertEqual(xmin, line[0]) - self.assertEqual(xmax, line[-1]) - tm.close() - - ax = tz_ts.plot.area(stacked=False, secondary_y=True) - xmin, xmax = ax.get_xlim() - line = ax.get_lines()[0].get_data(orig=False)[0] - self.assertEqual(xmin, line[0]) - self.assertEqual(xmax, line[-1]) - - def test_label(self): - s = Series([1, 2]) - ax = s.plot(label='LABEL', legend=True) - self._check_legend_labels(ax, labels=['LABEL']) - self.plt.close() - ax = s.plot(legend=True) - self._check_legend_labels(ax, labels=['None']) - self.plt.close() - # get name from index - s.name = 'NAME' - ax = s.plot(legend=True) - self._check_legend_labels(ax, labels=['NAME']) - self.plt.close() - # override the default - ax = s.plot(legend=True, label='LABEL') - self._check_legend_labels(ax, labels=['LABEL']) - self.plt.close() - # Add lebel info, but don't draw - ax = s.plot(legend=False, label='LABEL') - self.assertEqual(ax.get_legend(), None) # Hasn't been drawn - ax.legend() # draw it - self._check_legend_labels(ax, labels=['LABEL']) - - def test_line_area_nan_series(self): - values = [1, 2, np.nan, 3] - s = Series(values) - ts = Series(values, index=tm.makeDateIndex(k=4)) - - for d in [s, ts]: - ax = _check_plot_works(d.plot) - masked = ax.lines[0].get_ydata() - # remove nan for comparison purpose - exp = np.array([1, 2, 3], dtype=np.float64) - self.assert_numpy_array_equal(np.delete(masked.data, 2), exp) - self.assert_numpy_array_equal( - masked.mask, np.array([False, False, True, False])) - - expected = np.array([1, 2, 0, 3], dtype=np.float64) - ax = _check_plot_works(d.plot, stacked=True) - self.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected) - ax = _check_plot_works(d.plot.area) - self.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected) - ax = _check_plot_works(d.plot.area, stacked=False) - self.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected) - - def test_line_use_index_false(self): - s = Series([1, 2, 3], index=['a', 'b', 'c']) - s.index.name = 'The Index' - ax = s.plot(use_index=False) - label = ax.get_xlabel() - self.assertEqual(label, '') - ax2 = s.plot.bar(use_index=False) - label2 = ax2.get_xlabel() - self.assertEqual(label2, '') - - @slow - def test_bar_log(self): - expected = np.array([1., 10., 100., 1000.]) - - if not self.mpl_le_1_2_1: - expected = np.hstack((.1, expected, 1e4)) - - ax = Series([200, 500]).plot.bar(log=True) - tm.assert_numpy_array_equal(ax.yaxis.get_ticklocs(), expected) - tm.close() - - ax = Series([200, 500]).plot.barh(log=True) - tm.assert_numpy_array_equal(ax.xaxis.get_ticklocs(), expected) - tm.close() - - # GH 9905 - expected = np.array([1.0e-03, 1.0e-02, 1.0e-01, 1.0e+00]) - - if not self.mpl_le_1_2_1: - expected = np.hstack((1.0e-04, expected, 1.0e+01)) - - ax = Series([0.1, 0.01, 0.001]).plot(log=True, kind='bar') - self.assertEqual(ax.get_ylim(), (0.001, 0.10000000000000001)) - tm.assert_numpy_array_equal(ax.yaxis.get_ticklocs(), expected) - tm.close() - - ax = Series([0.1, 0.01, 0.001]).plot(log=True, kind='barh') - self.assertEqual(ax.get_xlim(), (0.001, 0.10000000000000001)) - tm.assert_numpy_array_equal(ax.xaxis.get_ticklocs(), expected) - - @slow - def test_bar_ignore_index(self): - df = Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']) - ax = df.plot.bar(use_index=False) - self._check_text_labels(ax.get_xticklabels(), ['0', '1', '2', '3']) - - def test_rotation(self): - df = DataFrame(randn(5, 5)) - # Default rot 0 - axes = df.plot() - self._check_ticks_props(axes, xrot=0) - - axes = 
df.plot(rot=30) - self._check_ticks_props(axes, xrot=30) - - def test_irregular_datetime(self): - rng = date_range('1/1/2000', '3/1/2000') - rng = rng[[0, 1, 2, 3, 5, 9, 10, 11, 12]] - ser = Series(randn(len(rng)), rng) - ax = ser.plot() - xp = datetime(1999, 1, 1).toordinal() - ax.set_xlim('1/1/1999', '1/1/2001') - self.assertEqual(xp, ax.get_xlim()[0]) - - @slow - def test_pie_series(self): - # if sum of values is less than 1.0, pie handle them as rate and draw - # semicircle. - series = Series(np.random.randint(1, 5), - index=['a', 'b', 'c', 'd', 'e'], name='YLABEL') - ax = _check_plot_works(series.plot.pie) - self._check_text_labels(ax.texts, series.index) - self.assertEqual(ax.get_ylabel(), 'YLABEL') - - # without wedge labels - ax = _check_plot_works(series.plot.pie, labels=None) - self._check_text_labels(ax.texts, [''] * 5) - - # with less colors than elements - color_args = ['r', 'g', 'b'] - ax = _check_plot_works(series.plot.pie, colors=color_args) - - color_expected = ['r', 'g', 'b', 'r', 'g'] - self._check_colors(ax.patches, facecolors=color_expected) - - # with labels and colors - labels = ['A', 'B', 'C', 'D', 'E'] - color_args = ['r', 'g', 'b', 'c', 'm'] - ax = _check_plot_works(series.plot.pie, labels=labels, - colors=color_args) - self._check_text_labels(ax.texts, labels) - self._check_colors(ax.patches, facecolors=color_args) - - # with autopct and fontsize - ax = _check_plot_works(series.plot.pie, colors=color_args, - autopct='%.2f', fontsize=7) - pcts = ['{0:.2f}'.format(s * 100) - for s in series.values / float(series.sum())] - iters = [iter(series.index), iter(pcts)] - expected_texts = list(next(it) for it in itertools.cycle(iters)) - self._check_text_labels(ax.texts, expected_texts) - for t in ax.texts: - self.assertEqual(t.get_fontsize(), 7) - - # includes negative value - with tm.assertRaises(ValueError): - series = Series([1, 2, 0, 4, -1], index=['a', 'b', 'c', 'd', 'e']) - series.plot.pie() - - # includes nan - series = Series([1, 2, np.nan, 4], index=['a', 'b', 'c', 'd'], - name='YLABEL') - ax = _check_plot_works(series.plot.pie) - self._check_text_labels(ax.texts, ['a', 'b', '', 'd']) - - def test_pie_nan(self): - s = Series([1, np.nan, 1, 1]) - ax = s.plot.pie(legend=True) - expected = ['0', '', '2', '3'] - result = [x.get_text() for x in ax.texts] - self.assertEqual(result, expected) - - @slow - def test_hist_df_kwargs(self): - df = DataFrame(np.random.randn(10, 2)) - ax = df.plot.hist(bins=5) - self.assertEqual(len(ax.patches), 10) - - @slow - def test_hist_df_with_nonnumerics(self): - # GH 9853 - with tm.RNGContext(1): - df = DataFrame( - np.random.randn(10, 4), columns=['A', 'B', 'C', 'D']) - df['E'] = ['x', 'y'] * 5 - ax = df.plot.hist(bins=5) - self.assertEqual(len(ax.patches), 20) - - ax = df.plot.hist() # bins=10 - self.assertEqual(len(ax.patches), 40) - - @slow - def test_hist_legacy(self): - _check_plot_works(self.ts.hist) - _check_plot_works(self.ts.hist, grid=False) - _check_plot_works(self.ts.hist, figsize=(8, 10)) - _check_plot_works(self.ts.hist, filterwarnings='ignore', - by=self.ts.index.month) - _check_plot_works(self.ts.hist, filterwarnings='ignore', - by=self.ts.index.month, bins=5) - - fig, ax = self.plt.subplots(1, 1) - _check_plot_works(self.ts.hist, ax=ax) - _check_plot_works(self.ts.hist, ax=ax, figure=fig) - _check_plot_works(self.ts.hist, figure=fig) - tm.close() - - fig, (ax1, ax2) = self.plt.subplots(1, 2) - _check_plot_works(self.ts.hist, figure=fig, ax=ax1) - _check_plot_works(self.ts.hist, figure=fig, ax=ax2) - - with 
tm.assertRaises(ValueError): - self.ts.hist(by=self.ts.index, figure=fig) - - @slow - def test_hist_bins_legacy(self): - df = DataFrame(np.random.randn(10, 2)) - ax = df.hist(bins=2)[0][0] - self.assertEqual(len(ax.patches), 2) - - @slow - def test_hist_layout(self): - df = self.hist_df - with tm.assertRaises(ValueError): - df.height.hist(layout=(1, 1)) - - with tm.assertRaises(ValueError): - df.height.hist(layout=[1, 1]) - - @slow - def test_hist_layout_with_by(self): - df = self.hist_df - - axes = _check_plot_works(df.height.hist, filterwarnings='ignore', - by=df.gender, layout=(2, 1)) - self._check_axes_shape(axes, axes_num=2, layout=(2, 1)) - - axes = _check_plot_works(df.height.hist, filterwarnings='ignore', - by=df.gender, layout=(3, -1)) - self._check_axes_shape(axes, axes_num=2, layout=(3, 1)) - - axes = _check_plot_works(df.height.hist, filterwarnings='ignore', - by=df.category, layout=(4, 1)) - self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) - - axes = _check_plot_works(df.height.hist, filterwarnings='ignore', - by=df.category, layout=(2, -1)) - self._check_axes_shape(axes, axes_num=4, layout=(2, 2)) - - axes = _check_plot_works(df.height.hist, filterwarnings='ignore', - by=df.category, layout=(3, -1)) - self._check_axes_shape(axes, axes_num=4, layout=(3, 2)) - - axes = _check_plot_works(df.height.hist, filterwarnings='ignore', - by=df.category, layout=(-1, 4)) - self._check_axes_shape(axes, axes_num=4, layout=(1, 4)) - - axes = _check_plot_works(df.height.hist, filterwarnings='ignore', - by=df.classroom, layout=(2, 2)) - self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) - - axes = df.height.hist(by=df.category, layout=(4, 2), figsize=(12, 7)) - self._check_axes_shape(axes, axes_num=4, layout=(4, 2), - figsize=(12, 7)) - - @slow - def test_hist_no_overlap(self): - from matplotlib.pyplot import subplot, gcf - x = Series(randn(2)) - y = Series(randn(2)) - subplot(121) - x.hist() - subplot(122) - y.hist() - fig = gcf() - axes = fig.get_axes() - self.assertEqual(len(axes), 2) - - @slow - def test_hist_secondary_legend(self): - # GH 9610 - df = DataFrame(np.random.randn(30, 4), columns=list('abcd')) - - # primary -> secondary - ax = df['a'].plot.hist(legend=True) - df['b'].plot.hist(ax=ax, legend=True, secondary_y=True) - # both legends are dran on left ax - # left and right axis must be visible - self._check_legend_labels(ax, labels=['a', 'b (right)']) - self.assertTrue(ax.get_yaxis().get_visible()) - self.assertTrue(ax.right_ax.get_yaxis().get_visible()) - tm.close() - - # secondary -> secondary - ax = df['a'].plot.hist(legend=True, secondary_y=True) - df['b'].plot.hist(ax=ax, legend=True, secondary_y=True) - # both legends are draw on left ax - # left axis must be invisible, right axis must be visible - self._check_legend_labels(ax.left_ax, - labels=['a (right)', 'b (right)']) - self.assertFalse(ax.left_ax.get_yaxis().get_visible()) - self.assertTrue(ax.get_yaxis().get_visible()) - tm.close() - - # secondary -> primary - ax = df['a'].plot.hist(legend=True, secondary_y=True) - # right axes is returned - df['b'].plot.hist(ax=ax, legend=True) - # both legends are draw on left ax - # left and right axis must be visible - self._check_legend_labels(ax.left_ax, labels=['a (right)', 'b']) - self.assertTrue(ax.left_ax.get_yaxis().get_visible()) - self.assertTrue(ax.get_yaxis().get_visible()) - tm.close() - - @slow - def test_df_series_secondary_legend(self): - # GH 9779 - df = DataFrame(np.random.randn(30, 3), columns=list('abc')) - s = Series(np.random.randn(30), name='x') - 
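The secondary-axis legend checks in the histogram and line tests above assert that a series plotted with secondary_y=True gets a '(right)' suffix in the shared legend and that the relevant y-axes stay visible. A minimal standalone sketch of that behaviour, assuming a non-interactive matplotlib backend (column names and sizes here are illustrative, not the fixture data):

    import numpy as np
    import pandas as pd
    import matplotlib
    matplotlib.use('Agg')  # headless backend, for the sketch only

    df = pd.DataFrame(np.random.randn(30, 2), columns=['a', 'b'])

    # primary -> secondary: 'a' on the left axis, 'b' on the right axis
    ax = df['a'].plot(legend=True)
    df['b'].plot(ax=ax, legend=True, secondary_y=True)

    # the combined legend lives on the left axis; the secondary series is
    # labelled 'b (right)', which is what the assertions above check for
    print([t.get_text() for t in ax.get_legend().get_texts()])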
- # primary -> secondary (without passing ax) - ax = df.plot() - s.plot(legend=True, secondary_y=True) - # both legends are dran on left ax - # left and right axis must be visible - self._check_legend_labels(ax, labels=['a', 'b', 'c', 'x (right)']) - self.assertTrue(ax.get_yaxis().get_visible()) - self.assertTrue(ax.right_ax.get_yaxis().get_visible()) - tm.close() - - # primary -> secondary (with passing ax) - ax = df.plot() - s.plot(ax=ax, legend=True, secondary_y=True) - # both legends are dran on left ax - # left and right axis must be visible - self._check_legend_labels(ax, labels=['a', 'b', 'c', 'x (right)']) - self.assertTrue(ax.get_yaxis().get_visible()) - self.assertTrue(ax.right_ax.get_yaxis().get_visible()) - tm.close() - - # seconcary -> secondary (without passing ax) - ax = df.plot(secondary_y=True) - s.plot(legend=True, secondary_y=True) - # both legends are dran on left ax - # left axis must be invisible and right axis must be visible - expected = ['a (right)', 'b (right)', 'c (right)', 'x (right)'] - self._check_legend_labels(ax.left_ax, labels=expected) - self.assertFalse(ax.left_ax.get_yaxis().get_visible()) - self.assertTrue(ax.get_yaxis().get_visible()) - tm.close() - - # secondary -> secondary (with passing ax) - ax = df.plot(secondary_y=True) - s.plot(ax=ax, legend=True, secondary_y=True) - # both legends are dran on left ax - # left axis must be invisible and right axis must be visible - expected = ['a (right)', 'b (right)', 'c (right)', 'x (right)'] - self._check_legend_labels(ax.left_ax, expected) - self.assertFalse(ax.left_ax.get_yaxis().get_visible()) - self.assertTrue(ax.get_yaxis().get_visible()) - tm.close() - - # secondary -> secondary (with passing ax) - ax = df.plot(secondary_y=True, mark_right=False) - s.plot(ax=ax, legend=True, secondary_y=True) - # both legends are dran on left ax - # left axis must be invisible and right axis must be visible - expected = ['a', 'b', 'c', 'x (right)'] - self._check_legend_labels(ax.left_ax, expected) - self.assertFalse(ax.left_ax.get_yaxis().get_visible()) - self.assertTrue(ax.get_yaxis().get_visible()) - tm.close() - - @slow - def test_plot_fails_with_dupe_color_and_style(self): - x = Series(randn(2)) - with tm.assertRaises(ValueError): - x.plot(style='k--', color='k') - - @slow - def test_hist_kde(self): - ax = self.ts.plot.hist(logy=True) - self._check_ax_scales(ax, yaxis='log') - xlabels = ax.get_xticklabels() - # ticks are values, thus ticklabels are blank - self._check_text_labels(xlabels, [''] * len(xlabels)) - ylabels = ax.get_yticklabels() - self._check_text_labels(ylabels, [''] * len(ylabels)) - - tm._skip_if_no_scipy() - _skip_if_no_scipy_gaussian_kde() - _check_plot_works(self.ts.plot.kde) - _check_plot_works(self.ts.plot.density) - ax = self.ts.plot.kde(logy=True) - self._check_ax_scales(ax, yaxis='log') - xlabels = ax.get_xticklabels() - self._check_text_labels(xlabels, [''] * len(xlabels)) - ylabels = ax.get_yticklabels() - self._check_text_labels(ylabels, [''] * len(ylabels)) - - @slow - def test_kde_kwargs(self): - tm._skip_if_no_scipy() - _skip_if_no_scipy_gaussian_kde() - from numpy import linspace - _check_plot_works(self.ts.plot.kde, bw_method=.5, - ind=linspace(-100, 100, 20)) - _check_plot_works(self.ts.plot.density, bw_method=.5, - ind=linspace(-100, 100, 20)) - ax = self.ts.plot.kde(logy=True, bw_method=.5, - ind=linspace(-100, 100, 20)) - self._check_ax_scales(ax, yaxis='log') - self._check_text_labels(ax.yaxis.get_label(), 'Density') - - @slow - def test_kde_missing_vals(self): - 
tm._skip_if_no_scipy() - _skip_if_no_scipy_gaussian_kde() - s = Series(np.random.uniform(size=50)) - s[0] = np.nan - _check_plot_works(s.plot.kde) - - @slow - def test_hist_kwargs(self): - ax = self.ts.plot.hist(bins=5) - self.assertEqual(len(ax.patches), 5) - self._check_text_labels(ax.yaxis.get_label(), 'Frequency') - tm.close() - - if self.mpl_ge_1_3_1: - ax = self.ts.plot.hist(orientation='horizontal') - self._check_text_labels(ax.xaxis.get_label(), 'Frequency') - tm.close() - - ax = self.ts.plot.hist(align='left', stacked=True) - tm.close() - - @slow - def test_hist_kde_color(self): - ax = self.ts.plot.hist(logy=True, bins=10, color='b') - self._check_ax_scales(ax, yaxis='log') - self.assertEqual(len(ax.patches), 10) - self._check_colors(ax.patches, facecolors=['b'] * 10) - - tm._skip_if_no_scipy() - _skip_if_no_scipy_gaussian_kde() - ax = self.ts.plot.kde(logy=True, color='r') - self._check_ax_scales(ax, yaxis='log') - lines = ax.get_lines() - self.assertEqual(len(lines), 1) - self._check_colors(lines, ['r']) - - @slow - def test_boxplot_series(self): - ax = self.ts.plot.box(logy=True) - self._check_ax_scales(ax, yaxis='log') - xlabels = ax.get_xticklabels() - self._check_text_labels(xlabels, [self.ts.name]) - ylabels = ax.get_yticklabels() - self._check_text_labels(ylabels, [''] * len(ylabels)) - - @slow - def test_kind_both_ways(self): - s = Series(range(3)) - for kind in plotting._common_kinds + plotting._series_kinds: - if not _ok_for_gaussian_kde(kind): - continue - s.plot(kind=kind) - getattr(s.plot, kind)() - - @slow - def test_invalid_plot_data(self): - s = Series(list('abcd')) - for kind in plotting._common_kinds: - if not _ok_for_gaussian_kde(kind): - continue - with tm.assertRaises(TypeError): - s.plot(kind=kind) - - @slow - def test_valid_object_plot(self): - s = Series(lrange(10), dtype=object) - for kind in plotting._common_kinds: - if not _ok_for_gaussian_kde(kind): - continue - _check_plot_works(s.plot, kind=kind) - - def test_partially_invalid_plot_data(self): - s = Series(['a', 'b', 1.0, 2]) - for kind in plotting._common_kinds: - if not _ok_for_gaussian_kde(kind): - continue - with tm.assertRaises(TypeError): - s.plot(kind=kind) - - def test_invalid_kind(self): - s = Series([1, 2]) - with tm.assertRaises(ValueError): - s.plot(kind='aasdf') - - @slow - def test_dup_datetime_index_plot(self): - dr1 = date_range('1/1/2009', periods=4) - dr2 = date_range('1/2/2009', periods=4) - index = dr1.append(dr2) - values = randn(index.size) - s = Series(values, index=index) - _check_plot_works(s.plot) - - @slow - def test_errorbar_plot(self): - - s = Series(np.arange(10), name='x') - s_err = np.random.randn(10) - d_err = DataFrame(randn(10, 2), index=s.index, columns=['x', 'y']) - # test line and bar plots - kinds = ['line', 'bar'] - for kind in kinds: - ax = _check_plot_works(s.plot, yerr=Series(s_err), kind=kind) - self._check_has_errorbars(ax, xerr=0, yerr=1) - ax = _check_plot_works(s.plot, yerr=s_err, kind=kind) - self._check_has_errorbars(ax, xerr=0, yerr=1) - ax = _check_plot_works(s.plot, yerr=s_err.tolist(), kind=kind) - self._check_has_errorbars(ax, xerr=0, yerr=1) - ax = _check_plot_works(s.plot, yerr=d_err, kind=kind) - self._check_has_errorbars(ax, xerr=0, yerr=1) - ax = _check_plot_works(s.plot, xerr=0.2, yerr=0.2, kind=kind) - self._check_has_errorbars(ax, xerr=1, yerr=1) - - ax = _check_plot_works(s.plot, xerr=s_err) - self._check_has_errorbars(ax, xerr=1, yerr=0) - - # test time series plotting - ix = date_range('1/1/2000', '1/1/2001', freq='M') - ts = 
Series(np.arange(12), index=ix, name='x') - ts_err = Series(np.random.randn(12), index=ix) - td_err = DataFrame(randn(12, 2), index=ix, columns=['x', 'y']) - - ax = _check_plot_works(ts.plot, yerr=ts_err) - self._check_has_errorbars(ax, xerr=0, yerr=1) - ax = _check_plot_works(ts.plot, yerr=td_err) - self._check_has_errorbars(ax, xerr=0, yerr=1) - - # check incorrect lengths and types - with tm.assertRaises(ValueError): - s.plot(yerr=np.arange(11)) - - s_err = ['zzz'] * 10 - # in mpl 1.5+ this is a TypeError - with tm.assertRaises((ValueError, TypeError)): - s.plot(yerr=s_err) - - def test_table(self): - _check_plot_works(self.series.plot, table=True) - _check_plot_works(self.series.plot, table=self.series) - - @slow - def test_series_grid_settings(self): - # Make sure plot defaults to rcParams['axes.grid'] setting, GH 9792 - self._check_grid_settings(Series([1, 2, 3]), - plotting._series_kinds + - plotting._common_kinds) - - @slow - def test_standard_colors(self): - for c in ['r', 'red', 'green', '#FF0000']: - result = plotting._get_standard_colors(1, color=c) - self.assertEqual(result, [c]) - - result = plotting._get_standard_colors(1, color=[c]) - self.assertEqual(result, [c]) - - result = plotting._get_standard_colors(3, color=c) - self.assertEqual(result, [c] * 3) - - result = plotting._get_standard_colors(3, color=[c]) - self.assertEqual(result, [c] * 3) - - @slow - def test_standard_colors_all(self): - import matplotlib.colors as colors - - # multiple colors like mediumaquamarine - for c in colors.cnames: - result = plotting._get_standard_colors(num_colors=1, color=c) - self.assertEqual(result, [c]) - - result = plotting._get_standard_colors(num_colors=1, color=[c]) - self.assertEqual(result, [c]) - - result = plotting._get_standard_colors(num_colors=3, color=c) - self.assertEqual(result, [c] * 3) - - result = plotting._get_standard_colors(num_colors=3, color=[c]) - self.assertEqual(result, [c] * 3) - - # single letter colors like k - for c in colors.ColorConverter.colors: - result = plotting._get_standard_colors(num_colors=1, color=c) - self.assertEqual(result, [c]) - - result = plotting._get_standard_colors(num_colors=1, color=[c]) - self.assertEqual(result, [c]) - - result = plotting._get_standard_colors(num_colors=3, color=c) - self.assertEqual(result, [c] * 3) - - result = plotting._get_standard_colors(num_colors=3, color=[c]) - self.assertEqual(result, [c] * 3) - - def test_series_plot_color_kwargs(self): - # GH1890 - ax = Series(np.arange(12) + 1).plot(color='green') - self._check_colors(ax.get_lines(), linecolors=['green']) - - def test_time_series_plot_color_kwargs(self): - # #1890 - ax = Series(np.arange(12) + 1, index=date_range( - '1/1/2000', periods=12)).plot(color='green') - self._check_colors(ax.get_lines(), linecolors=['green']) - - def test_time_series_plot_color_with_empty_kwargs(self): - import matplotlib as mpl - - if self.mpl_ge_1_5_0: - def_colors = self._maybe_unpack_cycler(mpl.rcParams) - else: - def_colors = mpl.rcParams['axes.color_cycle'] - index = date_range('1/1/2000', periods=12) - s = Series(np.arange(1, 13), index=index) - - ncolors = 3 - - for i in range(ncolors): - ax = s.plot() - self._check_colors(ax.get_lines(), linecolors=def_colors[:ncolors]) - - def test_xticklabels(self): - # GH11529 - s = Series(np.arange(10), index=['P%02d' % i for i in range(10)]) - ax = s.plot(xticks=[0, 3, 5, 9]) - exp = ['P%02d' % i for i in [0, 3, 5, 9]] - self._check_text_labels(ax.get_xticklabels(), exp) - - def test_custom_business_day_freq(self): - # GH7222 - from 
pandas.tseries.offsets import CustomBusinessDay - s = Series(range(100, 121), index=pd.bdate_range( - start='2014-05-01', end='2014-06-01', - freq=CustomBusinessDay(holidays=['2014-05-26']))) - - _check_plot_works(s.plot) +""" Test cases for DataFrame.plot """ @tm.mplskip @@ -1293,28 +44,29 @@ def setUp(self): "C": np.arange(20) + np.random.uniform( size=20)}) - from pandas import read_csv - path = os.path.join(curpath(), 'data', 'iris.csv') - self.iris = read_csv(path) - @slow def test_plot(self): df = self.tdf - _check_plot_works(df.plot, filterwarnings='ignore', grid=False) - axes = _check_plot_works(df.plot, filterwarnings='ignore', - subplots=True) + _check_plot_works(df.plot, grid=False) + # _check_plot_works adds an ax so catch warning. see GH #13188 + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.plot, + subplots=True) self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) - axes = _check_plot_works(df.plot, filterwarnings='ignore', - subplots=True, layout=(-1, 2)) + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.plot, + subplots=True, layout=(-1, 2)) self._check_axes_shape(axes, axes_num=4, layout=(2, 2)) - axes = _check_plot_works(df.plot, filterwarnings='ignore', - subplots=True, use_index=False) + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.plot, + subplots=True, use_index=False) self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) df = DataFrame({'x': [1, 2], 'y': [3, 4]}) - with tm.assertRaises(TypeError): + # mpl >= 1.5.2 (or slightly below) throw AttributError + with tm.assertRaises((TypeError, AttributeError)): df.plot.line(blarg=True) df = DataFrame(np.random.rand(10, 3), @@ -1326,8 +78,8 @@ def test_plot(self): _check_plot_works(df.plot, xticks=[1, 5, 10]) _check_plot_works(df.plot, ylim=(-100, 100), xlim=(-100, 100)) - _check_plot_works(df.plot, filterwarnings='ignore', - subplots=True, title='blah') + with tm.assert_produces_warning(UserWarning): + _check_plot_works(df.plot, subplots=True, title='blah') # We have to redo it here because _check_plot_works does two plots, # once without an ax kwarg and once with an ax kwarg and the new sharex @@ -2201,9 +953,12 @@ def test_scatter_colors(self): with tm.assertRaises(TypeError): df.plot.scatter(x='a', y='b', c='c', color='green') + default_colors = self._maybe_unpack_cycler(self.plt.rcParams) + ax = df.plot.scatter(x='a', y='b', c='c') - tm.assert_numpy_array_equal(ax.collections[0].get_facecolor()[0], - np.array([0, 0, 1, 1], dtype=np.float64)) + tm.assert_numpy_array_equal( + ax.collections[0].get_facecolor()[0], + np.array(self.colorconverter.to_rgba(default_colors[0]))) ax = df.plot.scatter(x='a', y='b', color='white') tm.assert_numpy_array_equal(ax.collections[0].get_facecolor()[0], @@ -2217,7 +972,9 @@ def test_plot_bar(self): _check_plot_works(df.plot.bar) _check_plot_works(df.plot.bar, legend=False) - _check_plot_works(df.plot.bar, filterwarnings='ignore', subplots=True) + # _check_plot_works adds an ax so catch warning. 
see GH #13188 + with tm.assert_produces_warning(UserWarning): + _check_plot_works(df.plot.bar, subplots=True) _check_plot_works(df.plot.bar, stacked=True) df = DataFrame(randn(10, 15), @@ -2399,7 +1156,8 @@ def test_boxplot(self): # different warning on py3 if not PY3: - axes = _check_plot_works(df.plot.box, subplots=True, logy=True) + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.plot.box, subplots=True, logy=True) self._check_axes_shape(axes, axes_num=3, layout=(1, 3)) self._check_ax_scales(axes, yaxis='log') @@ -2433,8 +1191,10 @@ def test_boxplot_vertical(self): self._check_text_labels(ax.get_yticklabels(), labels) self.assertEqual(len(ax.lines), self.bp_n_objects * len(numeric_cols)) - axes = _check_plot_works(df.plot.box, filterwarnings='ignore', - subplots=True, vert=False, logx=True) + # _check_plot_works adds an ax so catch warning. see GH #13188 + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.plot.box, + subplots=True, vert=False, logx=True) self._check_axes_shape(axes, axes_num=3, layout=(1, 3)) self._check_ax_scales(axes, xaxis='log') for ax, label in zip(axes, labels): @@ -2461,6 +1221,9 @@ def test_boxplot_return_type(self): result = df.plot.box(return_type='axes') self._check_box_return_type(result, 'axes') + result = df.plot.box() # default axes + self._check_box_return_type(result, 'axes') + result = df.plot.box(return_type='both') self._check_box_return_type(result, 'both') @@ -2470,7 +1233,7 @@ def test_boxplot_subplots_return_type(self): # normal style: return_type=None result = df.plot.box(subplots=True) - self.assertIsInstance(result, np.ndarray) + self.assertIsInstance(result, Series) self._check_box_return_type(result, None, expected_keys=[ 'height', 'weight', 'category']) @@ -2494,8 +1257,9 @@ def test_kde_df(self): ax = df.plot(kind='kde', rot=20, fontsize=5) self._check_ticks_props(ax, xrot=20, xlabelsize=5, ylabelsize=5) - axes = _check_plot_works(df.plot, filterwarnings='ignore', kind='kde', - subplots=True) + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.plot, kind='kde', + subplots=True) self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) axes = df.plot(kind='kde', logy=True, subplots=True) @@ -2522,8 +1286,9 @@ def test_hist_df(self): expected = [pprint_thing(c) for c in df.columns] self._check_legend_labels(ax, labels=expected) - axes = _check_plot_works(df.plot.hist, filterwarnings='ignore', - subplots=True, logy=True) + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.plot.hist, + subplots=True, logy=True) self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) self._check_ax_scales(axes, yaxis='log') @@ -2865,6 +1630,8 @@ def test_line_colors_and_styles_subplots(self): axes = df.plot(subplots=True) for ax, c in zip(axes, list(default_colors)): + if self.mpl_ge_2_0_0: + c = [c] self._check_colors(ax.get_lines(), linecolors=c) tm.close() @@ -2902,8 +1669,9 @@ def test_line_colors_and_styles_subplots(self): # Color contains shorthand hex value results in ValueError custom_colors = ['#F00', '#00F', '#FF0', '#000', '#FFF'] # Forced show plot - _check_plot_works(df.plot, color=custom_colors, subplots=True, - filterwarnings='ignore') + # _check_plot_works adds an ax so catch warning. 
see GH #13188 + with tm.assert_produces_warning(UserWarning): + _check_plot_works(df.plot, color=custom_colors, subplots=True) rgba_colors = lmap(cm.jet, np.linspace(0, 1, len(df))) for cmap in ['jet', cm.jet]: @@ -2944,9 +1712,14 @@ def test_area_colors(self): self._check_colors(poly, facecolors=custom_colors) handles, labels = ax.get_legend_handles_labels() - # legend is stored as Line2D, thus check linecolors - linehandles = [x for x in handles if not isinstance(x, PolyCollection)] - self._check_colors(linehandles, linecolors=custom_colors) + if self.mpl_ge_1_5_0: + self._check_colors(handles, facecolors=custom_colors) + else: + # legend is stored as Line2D, thus check linecolors + linehandles = [x for x in handles + if not isinstance(x, PolyCollection)] + self._check_colors(linehandles, linecolors=custom_colors) + for h in handles: self.assertTrue(h.get_alpha() is None) tm.close() @@ -2958,8 +1731,12 @@ def test_area_colors(self): self._check_colors(poly, facecolors=jet_colors) handles, labels = ax.get_legend_handles_labels() - linehandles = [x for x in handles if not isinstance(x, PolyCollection)] - self._check_colors(linehandles, linecolors=jet_colors) + if self.mpl_ge_1_5_0: + self._check_colors(handles, facecolors=jet_colors) + else: + linehandles = [x for x in handles + if not isinstance(x, PolyCollection)] + self._check_colors(linehandles, linecolors=jet_colors) for h in handles: self.assertTrue(h.get_alpha() is None) tm.close() @@ -2972,8 +1749,12 @@ def test_area_colors(self): self._check_colors(poly, facecolors=jet_with_alpha) handles, labels = ax.get_legend_handles_labels() - # Line2D can't have alpha in its linecolor - self._check_colors(handles[:len(jet_colors)], linecolors=jet_colors) + if self.mpl_ge_1_5_0: + linecolors = jet_with_alpha + else: + # Line2D can't have alpha in its linecolor + linecolors = jet_colors + self._check_colors(handles[:len(jet_colors)], linecolors=linecolors) for h in handles: self.assertEqual(h.get_alpha(), 0.5) @@ -3096,7 +1877,10 @@ def test_kde_colors_and_styles_subplots(self): @slow def test_boxplot_colors(self): def _check_colors(bp, box_c, whiskers_c, medians_c, caps_c='k', - fliers_c='b'): + fliers_c=None): + # TODO: outside this func? + if fliers_c is None: + fliers_c = 'k' if self.mpl_ge_2_0_0 else 'b' self._check_colors(bp['boxes'], linecolors=[box_c] * len(bp['boxes'])) self._check_colors(bp['whiskers'], @@ -3294,8 +2078,10 @@ def test_pie_df(self): ax = _check_plot_works(df.plot.pie, y=2) self._check_text_labels(ax.texts, df.index) - axes = _check_plot_works(df.plot.pie, filterwarnings='ignore', - subplots=True) + # _check_plot_works adds an ax so catch warning. 
see GH #13188 + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.plot.pie, + subplots=True) self.assertEqual(len(axes), len(df.columns)) for ax in axes: self._check_text_labels(ax.texts, df.index) @@ -3304,9 +2090,10 @@ def test_pie_df(self): labels = ['A', 'B', 'C', 'D', 'E'] color_args = ['r', 'g', 'b', 'c', 'm'] - axes = _check_plot_works(df.plot.pie, filterwarnings='ignore', - subplots=True, labels=labels, - colors=color_args) + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.plot.pie, + subplots=True, labels=labels, + colors=color_args) self.assertEqual(len(axes), len(df.columns)) for ax in axes: @@ -3362,9 +2149,12 @@ def test_errorbar_plot(self): self._check_has_errorbars(ax, xerr=2, yerr=2) ax = _check_plot_works(df.plot, xerr=0.2, yerr=0.2, kind=kind) self._check_has_errorbars(ax, xerr=2, yerr=2) - axes = _check_plot_works(df.plot, filterwarnings='ignore', - yerr=df_err, xerr=df_err, subplots=True, - kind=kind) + # _check_plot_works adds an ax so catch warning. see GH #13188 + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.plot, + yerr=df_err, xerr=df_err, + subplots=True, + kind=kind) self._check_has_errorbars(axes, xerr=1, yerr=1) ax = _check_plot_works((df + 1).plot, yerr=df_err, @@ -3455,8 +2245,11 @@ def test_errorbar_timeseries(self): self._check_has_errorbars(ax, xerr=0, yerr=1) ax = _check_plot_works(tdf.plot, yerr=tdf_err, kind=kind) self._check_has_errorbars(ax, xerr=0, yerr=2) - axes = _check_plot_works(tdf.plot, filterwarnings='ignore', - kind=kind, yerr=tdf_err, subplots=True) + # _check_plot_works adds an ax so catch warning. see GH #13188 + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(tdf.plot, + kind=kind, yerr=tdf_err, + subplots=True) self._check_has_errorbars(axes, xerr=0, yerr=1) def test_errorbar_asymmetrical(self): @@ -3464,16 +2257,24 @@ def test_errorbar_asymmetrical(self): np.random.seed(0) err = np.random.rand(3, 2, 5) - data = np.random.randn(5, 3) - df = DataFrame(data) + # each column is [0, 1, 2, 3, 4], [3, 4, 5, 6, 7]... 
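The asymmetrical-errorbar case being rewritten here passes df.plot a yerr array of shape (ncolumns, 2, npoints), where index 0 along the middle axis holds the downward offsets and index 1 the upward offsets, and an array of the wrong shape is rejected. A short sketch of that calling convention under the same assumptions (toy data, 'Agg' backend):

    import numpy as np
    import pandas as pd
    import matplotlib
    matplotlib.use('Agg')

    np.random.seed(0)
    df = pd.DataFrame(np.arange(15).reshape(3, 5)).T  # three columns, five points each

    # err[i, 0, :] are the lower offsets for column i, err[i, 1, :] the upper ones
    err = np.random.rand(3, 2, 5)
    ax = df.plot(yerr=err, xerr=err / 2)

    # a mis-shaped array (here the transpose) raises ValueError
    try:
        df.plot(yerr=err.T)
    except ValueError as exc:
        print('rejected:', exc)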
+ df = DataFrame(np.arange(15).reshape(3, 5)).T + data = df.values ax = df.plot(yerr=err, xerr=err / 2) - self.assertEqual(ax.lines[7].get_ydata()[0], data[0, 1] - err[1, 0, 0]) - self.assertEqual(ax.lines[8].get_ydata()[0], data[0, 1] + err[1, 1, 0]) + if self.mpl_ge_2_0_0: + yerr_0_0 = ax.collections[1].get_paths()[0].vertices[:, 1] + expected_0_0 = err[0, :, 0] * np.array([-1, 1]) + tm.assert_almost_equal(yerr_0_0, expected_0_0) + else: + self.assertEqual(ax.lines[7].get_ydata()[0], + data[0, 1] - err[1, 0, 0]) + self.assertEqual(ax.lines[8].get_ydata()[0], + data[0, 1] + err[1, 1, 0]) - self.assertEqual(ax.lines[5].get_xdata()[0], -err[1, 0, 0] / 2) - self.assertEqual(ax.lines[6].get_xdata()[0], err[1, 1, 0] / 2) + self.assertEqual(ax.lines[5].get_xdata()[0], -err[1, 0, 0] / 2) + self.assertEqual(ax.lines[6].get_xdata()[0], err[1, 1, 0] / 2) with tm.assertRaises(ValueError): df.plot(yerr=err.T) @@ -3509,9 +2310,17 @@ def test_errorbar_scatter(self): self._check_has_errorbars(ax, xerr=1, yerr=1) def _check_errorbar_color(containers, expected, has_err='has_xerr'): - errs = [c.lines[1][0] - for c in ax.containers if getattr(c, has_err, False)] - self._check_colors(errs, linecolors=[expected] * len(errs)) + lines = [] + errs = [c.lines + for c in ax.containers if getattr(c, has_err, False)][0] + for el in errs: + if is_list_like(el): + lines.extend(el) + else: + lines.append(el) + err_lines = [x for x in lines if x in ax.collections] + self._check_colors( + err_lines, linecolors=np.array([expected] * len(err_lines))) # GH 8081 df = DataFrame( @@ -3906,103 +2715,6 @@ def test_rcParams_bar_colors(self): for c in barplot.patches]) -@tm.mplskip -class TestDataFrameGroupByPlots(TestPlotBase): - - def test_series_groupby_plotting_nominally_works(self): - n = 10 - weight = Series(np.random.normal(166, 20, size=n)) - height = Series(np.random.normal(60, 10, size=n)) - with tm.RNGContext(42): - gender = np.random.choice(['male', 'female'], size=n) - - weight.groupby(gender).plot() - tm.close() - height.groupby(gender).hist() - tm.close() - # Regression test for GH8733 - height.groupby(gender).plot(alpha=0.5) - tm.close() - - def test_plotting_with_float_index_works(self): - # GH 7025 - df = DataFrame({'def': [1, 1, 1, 2, 2, 2, 3, 3, 3], - 'val': np.random.randn(9)}, - index=[1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0]) - - df.groupby('def')['val'].plot() - tm.close() - df.groupby('def')['val'].apply(lambda x: x.plot()) - tm.close() - - def test_hist_single_row(self): - # GH10214 - bins = np.arange(80, 100 + 2, 1) - df = DataFrame({"Name": ["AAA", "BBB"], - "ByCol": [1, 2], - "Mark": [85, 89]}) - df["Mark"].hist(by=df["ByCol"], bins=bins) - df = DataFrame({"Name": ["AAA"], "ByCol": [1], "Mark": [85]}) - df["Mark"].hist(by=df["ByCol"], bins=bins) - - def test_plot_submethod_works(self): - df = DataFrame({'x': [1, 2, 3, 4, 5], - 'y': [1, 2, 3, 2, 1], - 'z': list('ababa')}) - df.groupby('z').plot.scatter('x', 'y') - tm.close() - df.groupby('z')['x'].plot.line() - tm.close() - - def test_plot_kwargs(self): - - df = DataFrame({'x': [1, 2, 3, 4, 5], - 'y': [1, 2, 3, 2, 1], - 'z': list('ababa')}) - - res = df.groupby('z').plot(kind='scatter', x='x', y='y') - # check that a scatter plot is effectively plotted: the axes should - # contain a PathCollection from the scatter plot (GH11805) - self.assertEqual(len(res['a'].collections), 1) - - res = df.groupby('z').plot.scatter(x='x', y='y') - self.assertEqual(len(res['a'].collections), 1) - - -def _check_plot_works(f, filterwarnings='always', **kwargs): - 
import matplotlib.pyplot as plt - ret = None - with warnings.catch_warnings(): - warnings.simplefilter(filterwarnings) - try: - try: - fig = kwargs['figure'] - except KeyError: - fig = plt.gcf() - - plt.clf() - - ax = kwargs.get('ax', fig.add_subplot(211)) # noqa - ret = f(**kwargs) - - assert_is_valid_plot_return_object(ret) - - try: - kwargs['ax'] = fig.add_subplot(212) - ret = f(**kwargs) - except Exception: - pass - else: - assert_is_valid_plot_return_object(ret) - - with ensure_clean(return_filelike=True) as path: - plt.savefig(path) - finally: - tm.close(fig) - - return ret - - def _generate_4_axes_via_gridspec(): import matplotlib.pyplot as plt import matplotlib as mpl @@ -4017,11 +2729,6 @@ def _generate_4_axes_via_gridspec(): return gs, [ax_tl, ax_ll, ax_tr, ax_lr] -def curpath(): - pth, _ = os.path.split(os.path.abspath(__file__)) - return pth - - if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/tests/plotting/test_groupby.py b/pandas/tests/plotting/test_groupby.py new file mode 100644 index 0000000000000..101a6556c61bf --- /dev/null +++ b/pandas/tests/plotting/test_groupby.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python +# coding: utf-8 + +import nose + +from pandas import Series, DataFrame +import pandas.util.testing as tm + +import numpy as np + +from pandas.tests.plotting.common import TestPlotBase + + +""" Test cases for GroupBy.plot """ + + +@tm.mplskip +class TestDataFrameGroupByPlots(TestPlotBase): + + def test_series_groupby_plotting_nominally_works(self): + n = 10 + weight = Series(np.random.normal(166, 20, size=n)) + height = Series(np.random.normal(60, 10, size=n)) + with tm.RNGContext(42): + gender = np.random.choice(['male', 'female'], size=n) + + weight.groupby(gender).plot() + tm.close() + height.groupby(gender).hist() + tm.close() + # Regression test for GH8733 + height.groupby(gender).plot(alpha=0.5) + tm.close() + + def test_plotting_with_float_index_works(self): + # GH 7025 + df = DataFrame({'def': [1, 1, 1, 2, 2, 2, 3, 3, 3], + 'val': np.random.randn(9)}, + index=[1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0]) + + df.groupby('def')['val'].plot() + tm.close() + df.groupby('def')['val'].apply(lambda x: x.plot()) + tm.close() + + def test_hist_single_row(self): + # GH10214 + bins = np.arange(80, 100 + 2, 1) + df = DataFrame({"Name": ["AAA", "BBB"], + "ByCol": [1, 2], + "Mark": [85, 89]}) + df["Mark"].hist(by=df["ByCol"], bins=bins) + df = DataFrame({"Name": ["AAA"], "ByCol": [1], "Mark": [85]}) + df["Mark"].hist(by=df["ByCol"], bins=bins) + + def test_plot_submethod_works(self): + df = DataFrame({'x': [1, 2, 3, 4, 5], + 'y': [1, 2, 3, 2, 1], + 'z': list('ababa')}) + df.groupby('z').plot.scatter('x', 'y') + tm.close() + df.groupby('z')['x'].plot.line() + tm.close() + + def test_plot_kwargs(self): + + df = DataFrame({'x': [1, 2, 3, 4, 5], + 'y': [1, 2, 3, 2, 1], + 'z': list('ababa')}) + + res = df.groupby('z').plot(kind='scatter', x='x', y='y') + # check that a scatter plot is effectively plotted: the axes should + # contain a PathCollection from the scatter plot (GH11805) + self.assertEqual(len(res['a'].collections), 1) + + res = df.groupby('z').plot.scatter(x='x', y='y') + self.assertEqual(len(res['a'].collections), 1) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/plotting/test_hist_method.py b/pandas/tests/plotting/test_hist_method.py new file mode 100644 index 
0000000000000..c7bff5a31fc02 --- /dev/null +++ b/pandas/tests/plotting/test_hist_method.py @@ -0,0 +1,426 @@ +#!/usr/bin/env python +# coding: utf-8 + +import nose + +from pandas import Series, DataFrame +import pandas.util.testing as tm +from pandas.util.testing import slow + +import numpy as np +from numpy.random import randn + +import pandas.tools.plotting as plotting +from pandas.tests.plotting.common import (TestPlotBase, _check_plot_works) + + +""" Test cases for .hist method """ + + +@tm.mplskip +class TestSeriesPlots(TestPlotBase): + + def setUp(self): + TestPlotBase.setUp(self) + import matplotlib as mpl + mpl.rcdefaults() + + self.ts = tm.makeTimeSeries() + self.ts.name = 'ts' + + @slow + def test_hist_legacy(self): + _check_plot_works(self.ts.hist) + _check_plot_works(self.ts.hist, grid=False) + _check_plot_works(self.ts.hist, figsize=(8, 10)) + # _check_plot_works adds an ax so catch warning. see GH #13188 + with tm.assert_produces_warning(UserWarning): + _check_plot_works(self.ts.hist, by=self.ts.index.month) + with tm.assert_produces_warning(UserWarning): + _check_plot_works(self.ts.hist, by=self.ts.index.month, bins=5) + + fig, ax = self.plt.subplots(1, 1) + _check_plot_works(self.ts.hist, ax=ax) + _check_plot_works(self.ts.hist, ax=ax, figure=fig) + _check_plot_works(self.ts.hist, figure=fig) + tm.close() + + fig, (ax1, ax2) = self.plt.subplots(1, 2) + _check_plot_works(self.ts.hist, figure=fig, ax=ax1) + _check_plot_works(self.ts.hist, figure=fig, ax=ax2) + + with tm.assertRaises(ValueError): + self.ts.hist(by=self.ts.index, figure=fig) + + @slow + def test_hist_bins_legacy(self): + df = DataFrame(np.random.randn(10, 2)) + ax = df.hist(bins=2)[0][0] + self.assertEqual(len(ax.patches), 2) + + @slow + def test_hist_layout(self): + df = self.hist_df + with tm.assertRaises(ValueError): + df.height.hist(layout=(1, 1)) + + with tm.assertRaises(ValueError): + df.height.hist(layout=[1, 1]) + + @slow + def test_hist_layout_with_by(self): + df = self.hist_df + + # _check_plot_works adds an `ax` kwarg to the method call + # so we get a warning about an axis being cleared, even + # though we don't explicing pass one, see GH #13188 + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.height.hist, by=df.gender, + layout=(2, 1)) + self._check_axes_shape(axes, axes_num=2, layout=(2, 1)) + + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.height.hist, by=df.gender, + layout=(3, -1)) + self._check_axes_shape(axes, axes_num=2, layout=(3, 1)) + + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.height.hist, by=df.category, + layout=(4, 1)) + self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) + + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works( + df.height.hist, by=df.category, layout=(2, -1)) + self._check_axes_shape(axes, axes_num=4, layout=(2, 2)) + + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works( + df.height.hist, by=df.category, layout=(3, -1)) + self._check_axes_shape(axes, axes_num=4, layout=(3, 2)) + + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works( + df.height.hist, by=df.category, layout=(-1, 4)) + self._check_axes_shape(axes, axes_num=4, layout=(1, 4)) + + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works( + df.height.hist, by=df.classroom, layout=(2, 2)) + self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) + + axes = df.height.hist(by=df.category, layout=(4, 2), figsize=(12, 7)) + 
self._check_axes_shape( + axes, axes_num=4, layout=(4, 2), figsize=(12, 7)) + + @slow + def test_hist_no_overlap(self): + from matplotlib.pyplot import subplot, gcf + x = Series(randn(2)) + y = Series(randn(2)) + subplot(121) + x.hist() + subplot(122) + y.hist() + fig = gcf() + axes = fig.axes if self.mpl_ge_1_5_0 else fig.get_axes() + self.assertEqual(len(axes), 2) + + @slow + def test_hist_by_no_extra_plots(self): + df = self.hist_df + axes = df.height.hist(by=df.gender) # noqa + self.assertEqual(len(self.plt.get_fignums()), 1) + + @slow + def test_plot_fails_when_ax_differs_from_figure(self): + from pylab import figure + fig1 = figure() + fig2 = figure() + ax1 = fig1.add_subplot(111) + with tm.assertRaises(AssertionError): + self.ts.hist(ax=ax1, figure=fig2) + + +@tm.mplskip +class TestDataFramePlots(TestPlotBase): + + @slow + def test_hist_df_legacy(self): + from matplotlib.patches import Rectangle + with tm.assert_produces_warning(UserWarning): + _check_plot_works(self.hist_df.hist) + + # make sure layout is handled + df = DataFrame(randn(100, 3)) + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.hist, grid=False) + self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) + self.assertFalse(axes[1, 1].get_visible()) + + df = DataFrame(randn(100, 1)) + _check_plot_works(df.hist) + + # make sure layout is handled + df = DataFrame(randn(100, 6)) + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.hist, layout=(4, 2)) + self._check_axes_shape(axes, axes_num=6, layout=(4, 2)) + + # make sure sharex, sharey is handled + with tm.assert_produces_warning(UserWarning): + _check_plot_works(df.hist, sharex=True, sharey=True) + + # handle figsize arg + with tm.assert_produces_warning(UserWarning): + _check_plot_works(df.hist, figsize=(8, 10)) + + # check bins argument + with tm.assert_produces_warning(UserWarning): + _check_plot_works(df.hist, bins=5) + + # make sure xlabelsize and xrot are handled + ser = df[0] + xf, yf = 20, 18 + xrot, yrot = 30, 40 + axes = ser.hist(xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot) + self._check_ticks_props(axes, xlabelsize=xf, xrot=xrot, + ylabelsize=yf, yrot=yrot) + + xf, yf = 20, 18 + xrot, yrot = 30, 40 + axes = df.hist(xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot) + self._check_ticks_props(axes, xlabelsize=xf, xrot=xrot, + ylabelsize=yf, yrot=yrot) + + tm.close() + # make sure kwargs to hist are handled + ax = ser.hist(normed=True, cumulative=True, bins=4) + # height of last bin (index 5) must be 1.0 + rects = [x for x in ax.get_children() if isinstance(x, Rectangle)] + self.assertAlmostEqual(rects[-1].get_height(), 1.0) + + tm.close() + ax = ser.hist(log=True) + # scale of y must be 'log' + self._check_ax_scales(ax, yaxis='log') + + tm.close() + + # propagate attr exception from matplotlib.Axes.hist + with tm.assertRaises(AttributeError): + ser.hist(foo='bar') + + @slow + def test_hist_layout(self): + df = DataFrame(randn(100, 3)) + + layout_to_expected_size = ( + {'layout': None, 'expected_size': (2, 2)}, # default is 2x2 + {'layout': (2, 2), 'expected_size': (2, 2)}, + {'layout': (4, 1), 'expected_size': (4, 1)}, + {'layout': (1, 4), 'expected_size': (1, 4)}, + {'layout': (3, 3), 'expected_size': (3, 3)}, + {'layout': (-1, 4), 'expected_size': (1, 4)}, + {'layout': (4, -1), 'expected_size': (4, 1)}, + {'layout': (-1, 2), 'expected_size': (2, 2)}, + {'layout': (2, -1), 'expected_size': (2, 2)} + ) + + for layout_test in layout_to_expected_size: + axes = df.hist(layout=layout_test['layout']) + 
expected = layout_test['expected_size'] + self._check_axes_shape(axes, axes_num=3, layout=expected) + + # layout too small for all 4 plots + with tm.assertRaises(ValueError): + df.hist(layout=(1, 1)) + + # invalid format for layout + with tm.assertRaises(ValueError): + df.hist(layout=(1,)) + with tm.assertRaises(ValueError): + df.hist(layout=(-1, -1)) + + +@tm.mplskip +class TestDataFrameGroupByPlots(TestPlotBase): + + @slow + def test_grouped_hist_legacy(self): + from matplotlib.patches import Rectangle + + df = DataFrame(randn(500, 2), columns=['A', 'B']) + df['C'] = np.random.randint(0, 4, 500) + df['D'] = ['X'] * 500 + + axes = plotting.grouped_hist(df.A, by=df.C) + self._check_axes_shape(axes, axes_num=4, layout=(2, 2)) + + tm.close() + axes = df.hist(by=df.C) + self._check_axes_shape(axes, axes_num=4, layout=(2, 2)) + + tm.close() + # group by a key with single value + axes = df.hist(by='D', rot=30) + self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) + self._check_ticks_props(axes, xrot=30) + + tm.close() + # make sure kwargs to hist are handled + xf, yf = 20, 18 + xrot, yrot = 30, 40 + axes = plotting.grouped_hist(df.A, by=df.C, normed=True, + cumulative=True, bins=4, + xlabelsize=xf, xrot=xrot, + ylabelsize=yf, yrot=yrot) + # height of last bin (index 5) must be 1.0 + for ax in axes.ravel(): + rects = [x for x in ax.get_children() if isinstance(x, Rectangle)] + height = rects[-1].get_height() + self.assertAlmostEqual(height, 1.0) + self._check_ticks_props(axes, xlabelsize=xf, xrot=xrot, + ylabelsize=yf, yrot=yrot) + + tm.close() + axes = plotting.grouped_hist(df.A, by=df.C, log=True) + # scale of y must be 'log' + self._check_ax_scales(axes, yaxis='log') + + tm.close() + # propagate attr exception from matplotlib.Axes.hist + with tm.assertRaises(AttributeError): + plotting.grouped_hist(df.A, by=df.C, foo='bar') + + with tm.assert_produces_warning(FutureWarning): + df.hist(by='C', figsize='default') + + @slow + def test_grouped_hist_legacy2(self): + n = 10 + weight = Series(np.random.normal(166, 20, size=n)) + height = Series(np.random.normal(60, 10, size=n)) + with tm.RNGContext(42): + gender_int = np.random.choice([0, 1], size=n) + df_int = DataFrame({'height': height, 'weight': weight, + 'gender': gender_int}) + gb = df_int.groupby('gender') + axes = gb.hist() + self.assertEqual(len(axes), 2) + self.assertEqual(len(self.plt.get_fignums()), 2) + tm.close() + + @slow + def test_grouped_hist_layout(self): + df = self.hist_df + self.assertRaises(ValueError, df.hist, column='weight', by=df.gender, + layout=(1, 1)) + self.assertRaises(ValueError, df.hist, column='height', by=df.category, + layout=(1, 3)) + self.assertRaises(ValueError, df.hist, column='height', by=df.category, + layout=(-1, -1)) + + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.hist, column='height', by=df.gender, + layout=(2, 1)) + self._check_axes_shape(axes, axes_num=2, layout=(2, 1)) + + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.hist, column='height', by=df.gender, + layout=(2, -1)) + self._check_axes_shape(axes, axes_num=2, layout=(2, 1)) + + axes = df.hist(column='height', by=df.category, layout=(4, 1)) + self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) + + axes = df.hist(column='height', by=df.category, layout=(-1, 1)) + self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) + + axes = df.hist(column='height', by=df.category, + layout=(4, 2), figsize=(12, 8)) + self._check_axes_shape( + axes, axes_num=4, layout=(4, 2), figsize=(12, 8)) 
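The layout table in test_hist_layout above spells out how DataFrame.hist resolves a -1 placeholder from the number of histograms to draw (for example (-1, 2) becomes a 2x2 grid for three columns) and that a grid with too few cells is rejected. A small headless sketch of the same rule, with illustrative data:

    import numpy as np
    import pandas as pd
    import matplotlib
    matplotlib.use('Agg')

    df = pd.DataFrame(np.random.randn(100, 3))

    axes = df.hist(layout=(-1, 2))   # -1 resolved to 2 rows -> 2x2 grid
    print(axes.shape)

    axes = df.hist(layout=(4, -1))   # -1 resolved to 1 column -> 4x1 grid
    print(axes.shape)

    try:
        df.hist(layout=(1, 1))       # not enough cells for three columns
    except ValueError as exc:
        print('rejected:', exc)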
+ tm.close() + + # GH 6769 + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works( + df.hist, column='height', by='classroom', layout=(2, 2)) + self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) + + # without column + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(df.hist, by='classroom') + self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) + + axes = df.hist(by='gender', layout=(3, 5)) + self._check_axes_shape(axes, axes_num=2, layout=(3, 5)) + + axes = df.hist(column=['height', 'weight', 'category']) + self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) + + @slow + def test_grouped_hist_multiple_axes(self): + # GH 6970, GH 7069 + df = self.hist_df + + fig, axes = self.plt.subplots(2, 3) + returned = df.hist(column=['height', 'weight', 'category'], ax=axes[0]) + self._check_axes_shape(returned, axes_num=3, layout=(1, 3)) + self.assert_numpy_array_equal(returned, axes[0]) + self.assertIs(returned[0].figure, fig) + returned = df.hist(by='classroom', ax=axes[1]) + self._check_axes_shape(returned, axes_num=3, layout=(1, 3)) + self.assert_numpy_array_equal(returned, axes[1]) + self.assertIs(returned[0].figure, fig) + + with tm.assertRaises(ValueError): + fig, axes = self.plt.subplots(2, 3) + # pass different number of axes from required + axes = df.hist(column='height', ax=axes) + + @slow + def test_axis_share_x(self): + df = self.hist_df + # GH4089 + ax1, ax2 = df.hist(column='height', by=df.gender, sharex=True) + + # share x + self.assertTrue(ax1._shared_x_axes.joined(ax1, ax2)) + self.assertTrue(ax2._shared_x_axes.joined(ax1, ax2)) + + # don't share y + self.assertFalse(ax1._shared_y_axes.joined(ax1, ax2)) + self.assertFalse(ax2._shared_y_axes.joined(ax1, ax2)) + + @slow + def test_axis_share_y(self): + df = self.hist_df + ax1, ax2 = df.hist(column='height', by=df.gender, sharey=True) + + # share y + self.assertTrue(ax1._shared_y_axes.joined(ax1, ax2)) + self.assertTrue(ax2._shared_y_axes.joined(ax1, ax2)) + + # don't share x + self.assertFalse(ax1._shared_x_axes.joined(ax1, ax2)) + self.assertFalse(ax2._shared_x_axes.joined(ax1, ax2)) + + @slow + def test_axis_share_xy(self): + df = self.hist_df + ax1, ax2 = df.hist(column='height', by=df.gender, sharex=True, + sharey=True) + + # share both x and y + self.assertTrue(ax1._shared_x_axes.joined(ax1, ax2)) + self.assertTrue(ax2._shared_x_axes.joined(ax1, ax2)) + + self.assertTrue(ax1._shared_y_axes.joined(ax1, ax2)) + self.assertTrue(ax2._shared_y_axes.joined(ax1, ax2)) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py new file mode 100644 index 0000000000000..a484217da5969 --- /dev/null +++ b/pandas/tests/plotting/test_misc.py @@ -0,0 +1,283 @@ +#!/usr/bin/env python +# coding: utf-8 + +import nose + +from pandas import Series, DataFrame +from pandas.compat import lmap +import pandas.util.testing as tm +from pandas.util.testing import slow + +import numpy as np +from numpy import random +from numpy.random import randn + +import pandas.tools.plotting as plotting +from pandas.tests.plotting.common import (TestPlotBase, _check_plot_works, + _ok_for_gaussian_kde) + + +""" Test cases for misc plot functions """ + + +@tm.mplskip +class TestSeriesPlots(TestPlotBase): + + def setUp(self): + TestPlotBase.setUp(self) + import matplotlib as mpl + mpl.rcdefaults() + + self.ts = tm.makeTimeSeries() + self.ts.name = 'ts' + + @slow + def 
test_autocorrelation_plot(self): + from pandas.tools.plotting import autocorrelation_plot + _check_plot_works(autocorrelation_plot, series=self.ts) + _check_plot_works(autocorrelation_plot, series=self.ts.values) + + ax = autocorrelation_plot(self.ts, label='Test') + self._check_legend_labels(ax, labels=['Test']) + + @slow + def test_lag_plot(self): + from pandas.tools.plotting import lag_plot + _check_plot_works(lag_plot, series=self.ts) + _check_plot_works(lag_plot, series=self.ts, lag=5) + + @slow + def test_bootstrap_plot(self): + from pandas.tools.plotting import bootstrap_plot + _check_plot_works(bootstrap_plot, series=self.ts, size=10) + + +@tm.mplskip +class TestDataFramePlots(TestPlotBase): + + @slow + def test_scatter_plot_legacy(self): + tm._skip_if_no_scipy() + + df = DataFrame(randn(100, 2)) + + def scat(**kwds): + return plotting.scatter_matrix(df, **kwds) + + with tm.assert_produces_warning(UserWarning): + _check_plot_works(scat) + with tm.assert_produces_warning(UserWarning): + _check_plot_works(scat, marker='+') + with tm.assert_produces_warning(UserWarning): + _check_plot_works(scat, vmin=0) + if _ok_for_gaussian_kde('kde'): + with tm.assert_produces_warning(UserWarning): + _check_plot_works(scat, diagonal='kde') + if _ok_for_gaussian_kde('density'): + with tm.assert_produces_warning(UserWarning): + _check_plot_works(scat, diagonal='density') + with tm.assert_produces_warning(UserWarning): + _check_plot_works(scat, diagonal='hist') + with tm.assert_produces_warning(UserWarning): + _check_plot_works(scat, range_padding=.1) + + def scat2(x, y, by=None, ax=None, figsize=None): + return plotting.scatter_plot(df, x, y, by, ax, figsize=None) + + _check_plot_works(scat2, x=0, y=1) + grouper = Series(np.repeat([1, 2, 3, 4, 5], 20), df.index) + with tm.assert_produces_warning(UserWarning): + _check_plot_works(scat2, x=0, y=1, by=grouper) + + def test_scatter_matrix_axis(self): + tm._skip_if_no_scipy() + scatter_matrix = plotting.scatter_matrix + + with tm.RNGContext(42): + df = DataFrame(randn(100, 3)) + + # we are plotting multiples on a sub-plot + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(scatter_matrix, filterwarnings='always', + frame=df, range_padding=.1) + axes0_labels = axes[0][0].yaxis.get_majorticklabels() + + # GH 5662 + if self.mpl_ge_2_0_0: + expected = ['-2', '0', '2'] + else: + expected = ['-2', '-1', '0', '1', '2'] + self._check_text_labels(axes0_labels, expected) + self._check_ticks_props( + axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0) + + df[0] = ((df[0] - 2) / 3) + + # we are plotting multiples on a sub-plot + with tm.assert_produces_warning(UserWarning): + axes = _check_plot_works(scatter_matrix, filterwarnings='always', + frame=df, range_padding=.1) + axes0_labels = axes[0][0].yaxis.get_majorticklabels() + if self.mpl_ge_2_0_0: + expected = ['-1.0', '-0.5', '0.0'] + else: + expected = ['-1.2', '-1.0', '-0.8', '-0.6', '-0.4', '-0.2', '0.0'] + self._check_text_labels(axes0_labels, expected) + self._check_ticks_props( + axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0) + + @slow + def test_andrews_curves(self): + from pandas.tools.plotting import andrews_curves + from matplotlib import cm + + df = self.iris + + _check_plot_works(andrews_curves, frame=df, class_column='Name') + + rgba = ('#556270', '#4ECDC4', '#C7F464') + ax = _check_plot_works(andrews_curves, frame=df, + class_column='Name', color=rgba) + self._check_colors( + ax.get_lines()[:10], linecolors=rgba, mapping=df['Name'][:10]) + + cnames = ['dodgerblue', 
'aquamarine', 'seagreen'] + ax = _check_plot_works(andrews_curves, frame=df, + class_column='Name', color=cnames) + self._check_colors( + ax.get_lines()[:10], linecolors=cnames, mapping=df['Name'][:10]) + + ax = _check_plot_works(andrews_curves, frame=df, + class_column='Name', colormap=cm.jet) + cmaps = lmap(cm.jet, np.linspace(0, 1, df['Name'].nunique())) + self._check_colors( + ax.get_lines()[:10], linecolors=cmaps, mapping=df['Name'][:10]) + + length = 10 + df = DataFrame({"A": random.rand(length), + "B": random.rand(length), + "C": random.rand(length), + "Name": ["A"] * length}) + + _check_plot_works(andrews_curves, frame=df, class_column='Name') + + rgba = ('#556270', '#4ECDC4', '#C7F464') + ax = _check_plot_works(andrews_curves, frame=df, + class_column='Name', color=rgba) + self._check_colors( + ax.get_lines()[:10], linecolors=rgba, mapping=df['Name'][:10]) + + cnames = ['dodgerblue', 'aquamarine', 'seagreen'] + ax = _check_plot_works(andrews_curves, frame=df, + class_column='Name', color=cnames) + self._check_colors( + ax.get_lines()[:10], linecolors=cnames, mapping=df['Name'][:10]) + + ax = _check_plot_works(andrews_curves, frame=df, + class_column='Name', colormap=cm.jet) + cmaps = lmap(cm.jet, np.linspace(0, 1, df['Name'].nunique())) + self._check_colors( + ax.get_lines()[:10], linecolors=cmaps, mapping=df['Name'][:10]) + + colors = ['b', 'g', 'r'] + df = DataFrame({"A": [1, 2, 3], + "B": [1, 2, 3], + "C": [1, 2, 3], + "Name": colors}) + ax = andrews_curves(df, 'Name', color=colors) + handles, labels = ax.get_legend_handles_labels() + self._check_colors(handles, linecolors=colors) + + with tm.assert_produces_warning(FutureWarning): + andrews_curves(data=df, class_column='Name') + + @slow + def test_parallel_coordinates(self): + from pandas.tools.plotting import parallel_coordinates + from matplotlib import cm + + df = self.iris + + ax = _check_plot_works(parallel_coordinates, + frame=df, class_column='Name') + nlines = len(ax.get_lines()) + nxticks = len(ax.xaxis.get_ticklabels()) + + rgba = ('#556270', '#4ECDC4', '#C7F464') + ax = _check_plot_works(parallel_coordinates, + frame=df, class_column='Name', color=rgba) + self._check_colors( + ax.get_lines()[:10], linecolors=rgba, mapping=df['Name'][:10]) + + cnames = ['dodgerblue', 'aquamarine', 'seagreen'] + ax = _check_plot_works(parallel_coordinates, + frame=df, class_column='Name', color=cnames) + self._check_colors( + ax.get_lines()[:10], linecolors=cnames, mapping=df['Name'][:10]) + + ax = _check_plot_works(parallel_coordinates, + frame=df, class_column='Name', colormap=cm.jet) + cmaps = lmap(cm.jet, np.linspace(0, 1, df['Name'].nunique())) + self._check_colors( + ax.get_lines()[:10], linecolors=cmaps, mapping=df['Name'][:10]) + + ax = _check_plot_works(parallel_coordinates, + frame=df, class_column='Name', axvlines=False) + assert len(ax.get_lines()) == (nlines - nxticks) + + colors = ['b', 'g', 'r'] + df = DataFrame({"A": [1, 2, 3], + "B": [1, 2, 3], + "C": [1, 2, 3], + "Name": colors}) + ax = parallel_coordinates(df, 'Name', color=colors) + handles, labels = ax.get_legend_handles_labels() + self._check_colors(handles, linecolors=colors) + + with tm.assert_produces_warning(FutureWarning): + parallel_coordinates(data=df, class_column='Name') + with tm.assert_produces_warning(FutureWarning): + parallel_coordinates(df, 'Name', colors=colors) + + @slow + def test_radviz(self): + from pandas.tools.plotting import radviz + from matplotlib import cm + + df = self.iris + _check_plot_works(radviz, frame=df, class_column='Name') + + 
rgba = ('#556270', '#4ECDC4', '#C7F464') + ax = _check_plot_works( + radviz, frame=df, class_column='Name', color=rgba) + # skip Circle drawn as ticks + patches = [p for p in ax.patches[:20] if p.get_label() != ''] + self._check_colors( + patches[:10], facecolors=rgba, mapping=df['Name'][:10]) + + cnames = ['dodgerblue', 'aquamarine', 'seagreen'] + _check_plot_works(radviz, frame=df, class_column='Name', color=cnames) + patches = [p for p in ax.patches[:20] if p.get_label() != ''] + self._check_colors(patches, facecolors=cnames, mapping=df['Name'][:10]) + + _check_plot_works(radviz, frame=df, + class_column='Name', colormap=cm.jet) + cmaps = lmap(cm.jet, np.linspace(0, 1, df['Name'].nunique())) + patches = [p for p in ax.patches[:20] if p.get_label() != ''] + self._check_colors(patches, facecolors=cmaps, mapping=df['Name'][:10]) + + colors = [[0., 0., 1., 1.], + [0., 0.5, 1., 1.], + [1., 0., 0., 1.]] + df = DataFrame({"A": [1, 2, 3], + "B": [2, 1, 3], + "C": [3, 2, 1], + "Name": ['b', 'g', 'r']}) + ax = radviz(df, 'Name', color=colors) + handles, labels = ax.get_legend_handles_labels() + self._check_colors(handles, facecolors=colors) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py new file mode 100644 index 0000000000000..e752197c6ad77 --- /dev/null +++ b/pandas/tests/plotting/test_series.py @@ -0,0 +1,808 @@ +#!/usr/bin/env python +# coding: utf-8 + +import nose +import itertools + +from datetime import datetime + +import pandas as pd +from pandas import Series, DataFrame, date_range +from pandas.compat import range, lrange +import pandas.util.testing as tm +from pandas.util.testing import slow + +import numpy as np +from numpy.random import randn + +import pandas.tools.plotting as plotting +from pandas.tests.plotting.common import (TestPlotBase, _check_plot_works, + _skip_if_no_scipy_gaussian_kde, + _ok_for_gaussian_kde) + + +""" Test cases for Series.plot """ + + +@tm.mplskip +class TestSeriesPlots(TestPlotBase): + + def setUp(self): + TestPlotBase.setUp(self) + import matplotlib as mpl + mpl.rcdefaults() + + self.ts = tm.makeTimeSeries() + self.ts.name = 'ts' + + self.series = tm.makeStringSeries() + self.series.name = 'series' + + self.iseries = tm.makePeriodSeries() + self.iseries.name = 'iseries' + + @slow + def test_plot(self): + _check_plot_works(self.ts.plot, label='foo') + _check_plot_works(self.ts.plot, use_index=False) + axes = _check_plot_works(self.ts.plot, rot=0) + self._check_ticks_props(axes, xrot=0) + + ax = _check_plot_works(self.ts.plot, style='.', logy=True) + self._check_ax_scales(ax, yaxis='log') + + ax = _check_plot_works(self.ts.plot, style='.', logx=True) + self._check_ax_scales(ax, xaxis='log') + + ax = _check_plot_works(self.ts.plot, style='.', loglog=True) + self._check_ax_scales(ax, xaxis='log', yaxis='log') + + _check_plot_works(self.ts[:10].plot.bar) + _check_plot_works(self.ts.plot.area, stacked=False) + _check_plot_works(self.iseries.plot) + + for kind in ['line', 'bar', 'barh', 'kde', 'hist', 'box']: + if not _ok_for_gaussian_kde(kind): + continue + _check_plot_works(self.series[:5].plot, kind=kind) + + _check_plot_works(self.series[:10].plot.barh) + ax = _check_plot_works(Series(randn(10)).plot.bar, color='black') + self._check_colors([ax.patches[0]], facecolors=['black']) + + # GH 6951 + ax = _check_plot_works(self.ts.plot, subplots=True) + self._check_axes_shape(ax, axes_num=1, layout=(1, 1)) 
+
+        ax = _check_plot_works(self.ts.plot, subplots=True, layout=(-1, 1))
+        self._check_axes_shape(ax, axes_num=1, layout=(1, 1))
+        ax = _check_plot_works(self.ts.plot, subplots=True, layout=(1, -1))
+        self._check_axes_shape(ax, axes_num=1, layout=(1, 1))
+
+    @slow
+    def test_plot_figsize_and_title(self):
+        # figsize and title
+        ax = self.series.plot(title='Test', figsize=(16, 8))
+        self._check_text_labels(ax.title, 'Test')
+        self._check_axes_shape(ax, axes_num=1, layout=(1, 1), figsize=(16, 8))
+
+    def test_dont_modify_rcParams(self):
+        # GH 8242
+        if self.mpl_ge_1_5_0:
+            key = 'axes.prop_cycle'
+        else:
+            key = 'axes.color_cycle'
+        colors = self.plt.rcParams[key]
+        Series([1, 2, 3]).plot()
+        self.assertEqual(colors, self.plt.rcParams[key])
+
+    def test_ts_line_lim(self):
+        ax = self.ts.plot()
+        xmin, xmax = ax.get_xlim()
+        lines = ax.get_lines()
+        self.assertEqual(xmin, lines[0].get_data(orig=False)[0][0])
+        self.assertEqual(xmax, lines[0].get_data(orig=False)[0][-1])
+        tm.close()
+
+        ax = self.ts.plot(secondary_y=True)
+        xmin, xmax = ax.get_xlim()
+        lines = ax.get_lines()
+        self.assertEqual(xmin, lines[0].get_data(orig=False)[0][0])
+        self.assertEqual(xmax, lines[0].get_data(orig=False)[0][-1])
+
+    def test_ts_area_lim(self):
+        ax = self.ts.plot.area(stacked=False)
+        xmin, xmax = ax.get_xlim()
+        line = ax.get_lines()[0].get_data(orig=False)[0]
+        self.assertEqual(xmin, line[0])
+        self.assertEqual(xmax, line[-1])
+        tm.close()
+
+        # GH 7471
+        ax = self.ts.plot.area(stacked=False, x_compat=True)
+        xmin, xmax = ax.get_xlim()
+        line = ax.get_lines()[0].get_data(orig=False)[0]
+        self.assertEqual(xmin, line[0])
+        self.assertEqual(xmax, line[-1])
+        tm.close()
+
+        tz_ts = self.ts.copy()
+        tz_ts.index = tz_ts.tz_localize('GMT').tz_convert('CET')
+        ax = tz_ts.plot.area(stacked=False, x_compat=True)
+        xmin, xmax = ax.get_xlim()
+        line = ax.get_lines()[0].get_data(orig=False)[0]
+        self.assertEqual(xmin, line[0])
+        self.assertEqual(xmax, line[-1])
+        tm.close()
+
+        ax = tz_ts.plot.area(stacked=False, secondary_y=True)
+        xmin, xmax = ax.get_xlim()
+        line = ax.get_lines()[0].get_data(orig=False)[0]
+        self.assertEqual(xmin, line[0])
+        self.assertEqual(xmax, line[-1])
+
+    def test_label(self):
+        s = Series([1, 2])
+        ax = s.plot(label='LABEL', legend=True)
+        self._check_legend_labels(ax, labels=['LABEL'])
+        self.plt.close()
+        ax = s.plot(legend=True)
+        self._check_legend_labels(ax, labels=['None'])
+        self.plt.close()
+        # get name from index
+        s.name = 'NAME'
+        ax = s.plot(legend=True)
+        self._check_legend_labels(ax, labels=['NAME'])
+        self.plt.close()
+        # override the default
+        ax = s.plot(legend=True, label='LABEL')
+        self._check_legend_labels(ax, labels=['LABEL'])
+        self.plt.close()
+        # Add label info, but don't draw
+        ax = s.plot(legend=False, label='LABEL')
+        self.assertEqual(ax.get_legend(), None)  # Hasn't been drawn
+        ax.legend()  # draw it
+        self._check_legend_labels(ax, labels=['LABEL'])
+
+    def test_line_area_nan_series(self):
+        values = [1, 2, np.nan, 3]
+        s = Series(values)
+        ts = Series(values, index=tm.makeDateIndex(k=4))
+
+        for d in [s, ts]:
+            ax = _check_plot_works(d.plot)
+            masked = ax.lines[0].get_ydata()
+            # remove nan for comparison purpose
+            exp = np.array([1, 2, 3], dtype=np.float64)
+            self.assert_numpy_array_equal(np.delete(masked.data, 2), exp)
+            self.assert_numpy_array_equal(
+                masked.mask, np.array([False, False, True, False]))
+
+            expected = np.array([1, 2, 0, 3], dtype=np.float64)
+            ax = _check_plot_works(d.plot, stacked=True)
+            self.assert_numpy_array_equal(ax.lines[0].get_ydata(),
expected) + ax = _check_plot_works(d.plot.area) + self.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected) + ax = _check_plot_works(d.plot.area, stacked=False) + self.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected) + + def test_line_use_index_false(self): + s = Series([1, 2, 3], index=['a', 'b', 'c']) + s.index.name = 'The Index' + ax = s.plot(use_index=False) + label = ax.get_xlabel() + self.assertEqual(label, '') + ax2 = s.plot.bar(use_index=False) + label2 = ax2.get_xlabel() + self.assertEqual(label2, '') + + @slow + def test_bar_log(self): + expected = np.array([1., 10., 100., 1000.]) + + if not self.mpl_le_1_2_1: + expected = np.hstack((.1, expected, 1e4)) + + ax = Series([200, 500]).plot.bar(log=True) + tm.assert_numpy_array_equal(ax.yaxis.get_ticklocs(), expected) + tm.close() + + ax = Series([200, 500]).plot.barh(log=True) + tm.assert_numpy_array_equal(ax.xaxis.get_ticklocs(), expected) + tm.close() + + # GH 9905 + expected = np.array([1.0e-03, 1.0e-02, 1.0e-01, 1.0e+00]) + + if not self.mpl_le_1_2_1: + expected = np.hstack((1.0e-04, expected, 1.0e+01)) + + ax = Series([0.1, 0.01, 0.001]).plot(log=True, kind='bar') + ymax = 0.12589254117941673 if self.mpl_ge_2_0_0 else .10000000000000001 + self.assertEqual(ax.get_ylim(), (0.001, ymax)) + tm.assert_numpy_array_equal(ax.yaxis.get_ticklocs(), expected) + tm.close() + + ax = Series([0.1, 0.01, 0.001]).plot(log=True, kind='barh') + self.assertEqual(ax.get_xlim(), (0.001, ymax)) + tm.assert_numpy_array_equal(ax.xaxis.get_ticklocs(), expected) + + @slow + def test_bar_ignore_index(self): + df = Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']) + ax = df.plot.bar(use_index=False) + self._check_text_labels(ax.get_xticklabels(), ['0', '1', '2', '3']) + + def test_rotation(self): + df = DataFrame(randn(5, 5)) + # Default rot 0 + axes = df.plot() + self._check_ticks_props(axes, xrot=0) + + axes = df.plot(rot=30) + self._check_ticks_props(axes, xrot=30) + + def test_irregular_datetime(self): + rng = date_range('1/1/2000', '3/1/2000') + rng = rng[[0, 1, 2, 3, 5, 9, 10, 11, 12]] + ser = Series(randn(len(rng)), rng) + ax = ser.plot() + xp = datetime(1999, 1, 1).toordinal() + ax.set_xlim('1/1/1999', '1/1/2001') + self.assertEqual(xp, ax.get_xlim()[0]) + + @slow + def test_pie_series(self): + # if sum of values is less than 1.0, pie handle them as rate and draw + # semicircle. 
+ series = Series(np.random.randint(1, 5), + index=['a', 'b', 'c', 'd', 'e'], name='YLABEL') + ax = _check_plot_works(series.plot.pie) + self._check_text_labels(ax.texts, series.index) + self.assertEqual(ax.get_ylabel(), 'YLABEL') + + # without wedge labels + ax = _check_plot_works(series.plot.pie, labels=None) + self._check_text_labels(ax.texts, [''] * 5) + + # with less colors than elements + color_args = ['r', 'g', 'b'] + ax = _check_plot_works(series.plot.pie, colors=color_args) + + color_expected = ['r', 'g', 'b', 'r', 'g'] + self._check_colors(ax.patches, facecolors=color_expected) + + # with labels and colors + labels = ['A', 'B', 'C', 'D', 'E'] + color_args = ['r', 'g', 'b', 'c', 'm'] + ax = _check_plot_works(series.plot.pie, labels=labels, + colors=color_args) + self._check_text_labels(ax.texts, labels) + self._check_colors(ax.patches, facecolors=color_args) + + # with autopct and fontsize + ax = _check_plot_works(series.plot.pie, colors=color_args, + autopct='%.2f', fontsize=7) + pcts = ['{0:.2f}'.format(s * 100) + for s in series.values / float(series.sum())] + iters = [iter(series.index), iter(pcts)] + expected_texts = list(next(it) for it in itertools.cycle(iters)) + self._check_text_labels(ax.texts, expected_texts) + for t in ax.texts: + self.assertEqual(t.get_fontsize(), 7) + + # includes negative value + with tm.assertRaises(ValueError): + series = Series([1, 2, 0, 4, -1], index=['a', 'b', 'c', 'd', 'e']) + series.plot.pie() + + # includes nan + series = Series([1, 2, np.nan, 4], index=['a', 'b', 'c', 'd'], + name='YLABEL') + ax = _check_plot_works(series.plot.pie) + self._check_text_labels(ax.texts, ['a', 'b', '', 'd']) + + def test_pie_nan(self): + s = Series([1, np.nan, 1, 1]) + ax = s.plot.pie(legend=True) + expected = ['0', '', '2', '3'] + result = [x.get_text() for x in ax.texts] + self.assertEqual(result, expected) + + @slow + def test_hist_df_kwargs(self): + df = DataFrame(np.random.randn(10, 2)) + ax = df.plot.hist(bins=5) + self.assertEqual(len(ax.patches), 10) + + @slow + def test_hist_df_with_nonnumerics(self): + # GH 9853 + with tm.RNGContext(1): + df = DataFrame( + np.random.randn(10, 4), columns=['A', 'B', 'C', 'D']) + df['E'] = ['x', 'y'] * 5 + ax = df.plot.hist(bins=5) + self.assertEqual(len(ax.patches), 20) + + ax = df.plot.hist() # bins=10 + self.assertEqual(len(ax.patches), 40) + + @slow + def test_hist_legacy(self): + _check_plot_works(self.ts.hist) + _check_plot_works(self.ts.hist, grid=False) + _check_plot_works(self.ts.hist, figsize=(8, 10)) + # _check_plot_works adds an ax so catch warning. 
see GH #13188
+        with tm.assert_produces_warning(UserWarning):
+            _check_plot_works(self.ts.hist,
+                              by=self.ts.index.month)
+        with tm.assert_produces_warning(UserWarning):
+            _check_plot_works(self.ts.hist,
+                              by=self.ts.index.month, bins=5)
+
+        fig, ax = self.plt.subplots(1, 1)
+        _check_plot_works(self.ts.hist, ax=ax)
+        _check_plot_works(self.ts.hist, ax=ax, figure=fig)
+        _check_plot_works(self.ts.hist, figure=fig)
+        tm.close()
+
+        fig, (ax1, ax2) = self.plt.subplots(1, 2)
+        _check_plot_works(self.ts.hist, figure=fig, ax=ax1)
+        _check_plot_works(self.ts.hist, figure=fig, ax=ax2)
+
+        with tm.assertRaises(ValueError):
+            self.ts.hist(by=self.ts.index, figure=fig)
+
+    @slow
+    def test_hist_bins_legacy(self):
+        df = DataFrame(np.random.randn(10, 2))
+        ax = df.hist(bins=2)[0][0]
+        self.assertEqual(len(ax.patches), 2)
+
+    @slow
+    def test_hist_layout(self):
+        df = self.hist_df
+        with tm.assertRaises(ValueError):
+            df.height.hist(layout=(1, 1))
+
+        with tm.assertRaises(ValueError):
+            df.height.hist(layout=[1, 1])
+
+    @slow
+    def test_hist_layout_with_by(self):
+        df = self.hist_df
+
+        # _check_plot_works adds an ax so catch warning. see GH #13188
+        with tm.assert_produces_warning(UserWarning):
+            axes = _check_plot_works(df.height.hist,
+                                     by=df.gender, layout=(2, 1))
+        self._check_axes_shape(axes, axes_num=2, layout=(2, 1))
+
+        with tm.assert_produces_warning(UserWarning):
+            axes = _check_plot_works(df.height.hist,
+                                     by=df.gender, layout=(3, -1))
+        self._check_axes_shape(axes, axes_num=2, layout=(3, 1))
+
+        with tm.assert_produces_warning(UserWarning):
+            axes = _check_plot_works(df.height.hist,
+                                     by=df.category, layout=(4, 1))
+        self._check_axes_shape(axes, axes_num=4, layout=(4, 1))
+
+        with tm.assert_produces_warning(UserWarning):
+            axes = _check_plot_works(df.height.hist,
+                                     by=df.category, layout=(2, -1))
+        self._check_axes_shape(axes, axes_num=4, layout=(2, 2))
+
+        with tm.assert_produces_warning(UserWarning):
+            axes = _check_plot_works(df.height.hist,
+                                     by=df.category, layout=(3, -1))
+        self._check_axes_shape(axes, axes_num=4, layout=(3, 2))
+
+        with tm.assert_produces_warning(UserWarning):
+            axes = _check_plot_works(df.height.hist,
+                                     by=df.category, layout=(-1, 4))
+        self._check_axes_shape(axes, axes_num=4, layout=(1, 4))
+
+        with tm.assert_produces_warning(UserWarning):
+            axes = _check_plot_works(df.height.hist,
+                                     by=df.classroom, layout=(2, 2))
+        self._check_axes_shape(axes, axes_num=3, layout=(2, 2))
+
+        axes = df.height.hist(by=df.category, layout=(4, 2), figsize=(12, 7))
+        self._check_axes_shape(axes, axes_num=4, layout=(4, 2),
+                               figsize=(12, 7))
+
+    @slow
+    def test_hist_no_overlap(self):
+        from matplotlib.pyplot import subplot, gcf
+        x = Series(randn(2))
+        y = Series(randn(2))
+        subplot(121)
+        x.hist()
+        subplot(122)
+        y.hist()
+        fig = gcf()
+        axes = fig.axes if self.mpl_ge_1_5_0 else fig.get_axes()
+        self.assertEqual(len(axes), 2)
+
+    @slow
+    def test_hist_secondary_legend(self):
+        # GH 9610
+        df = DataFrame(np.random.randn(30, 4), columns=list('abcd'))
+
+        # primary -> secondary
+        ax = df['a'].plot.hist(legend=True)
+        df['b'].plot.hist(ax=ax, legend=True, secondary_y=True)
+        # both legends are drawn on left ax
+        # left and right axis must be visible
+        self._check_legend_labels(ax, labels=['a', 'b (right)'])
+        self.assertTrue(ax.get_yaxis().get_visible())
+        self.assertTrue(ax.right_ax.get_yaxis().get_visible())
+        tm.close()
+
+        # secondary -> secondary
+        ax = df['a'].plot.hist(legend=True, secondary_y=True)
+        df['b'].plot.hist(ax=ax, legend=True, secondary_y=True)
+        # both legends are drawn on left ax
+        # left axis must be invisible, right axis must be visible
+        self._check_legend_labels(ax.left_ax,
+                                  labels=['a (right)', 'b (right)'])
+        self.assertFalse(ax.left_ax.get_yaxis().get_visible())
+        self.assertTrue(ax.get_yaxis().get_visible())
+        tm.close()
+
+        # secondary -> primary
+        ax = df['a'].plot.hist(legend=True, secondary_y=True)
+        # right axes is returned
+        df['b'].plot.hist(ax=ax, legend=True)
+        # both legends are drawn on left ax
+        # left and right axis must be visible
+        self._check_legend_labels(ax.left_ax, labels=['a (right)', 'b'])
+        self.assertTrue(ax.left_ax.get_yaxis().get_visible())
+        self.assertTrue(ax.get_yaxis().get_visible())
+        tm.close()
+
+    @slow
+    def test_df_series_secondary_legend(self):
+        # GH 9779
+        df = DataFrame(np.random.randn(30, 3), columns=list('abc'))
+        s = Series(np.random.randn(30), name='x')
+
+        # primary -> secondary (without passing ax)
+        ax = df.plot()
+        s.plot(legend=True, secondary_y=True)
+        # both legends are drawn on left ax
+        # left and right axis must be visible
+        self._check_legend_labels(ax, labels=['a', 'b', 'c', 'x (right)'])
+        self.assertTrue(ax.get_yaxis().get_visible())
+        self.assertTrue(ax.right_ax.get_yaxis().get_visible())
+        tm.close()
+
+        # primary -> secondary (with passing ax)
+        ax = df.plot()
+        s.plot(ax=ax, legend=True, secondary_y=True)
+        # both legends are drawn on left ax
+        # left and right axis must be visible
+        self._check_legend_labels(ax, labels=['a', 'b', 'c', 'x (right)'])
+        self.assertTrue(ax.get_yaxis().get_visible())
+        self.assertTrue(ax.right_ax.get_yaxis().get_visible())
+        tm.close()
+
+        # secondary -> secondary (without passing ax)
+        ax = df.plot(secondary_y=True)
+        s.plot(legend=True, secondary_y=True)
+        # both legends are drawn on left ax
+        # left axis must be invisible and right axis must be visible
+        expected = ['a (right)', 'b (right)', 'c (right)', 'x (right)']
+        self._check_legend_labels(ax.left_ax, labels=expected)
+        self.assertFalse(ax.left_ax.get_yaxis().get_visible())
+        self.assertTrue(ax.get_yaxis().get_visible())
+        tm.close()
+
+        # secondary -> secondary (with passing ax)
+        ax = df.plot(secondary_y=True)
+        s.plot(ax=ax, legend=True, secondary_y=True)
+        # both legends are drawn on left ax
+        # left axis must be invisible and right axis must be visible
+        expected = ['a (right)', 'b (right)', 'c (right)', 'x (right)']
+        self._check_legend_labels(ax.left_ax, expected)
+        self.assertFalse(ax.left_ax.get_yaxis().get_visible())
+        self.assertTrue(ax.get_yaxis().get_visible())
+        tm.close()
+
+        # secondary -> secondary (with passing ax)
+        ax = df.plot(secondary_y=True, mark_right=False)
+        s.plot(ax=ax, legend=True, secondary_y=True)
+        # both legends are drawn on left ax
+        # left axis must be invisible and right axis must be visible
+        expected = ['a', 'b', 'c', 'x (right)']
+        self._check_legend_labels(ax.left_ax, expected)
+        self.assertFalse(ax.left_ax.get_yaxis().get_visible())
+        self.assertTrue(ax.get_yaxis().get_visible())
+        tm.close()
+
+    @slow
+    def test_plot_fails_with_dupe_color_and_style(self):
+        x = Series(randn(2))
+        with tm.assertRaises(ValueError):
+            x.plot(style='k--', color='k')
+
+    @slow
+    def test_hist_kde(self):
+        ax = self.ts.plot.hist(logy=True)
+        self._check_ax_scales(ax, yaxis='log')
+        xlabels = ax.get_xticklabels()
+        # ticks are values, thus ticklabels are blank
+        self._check_text_labels(xlabels, [''] * len(xlabels))
+        ylabels = ax.get_yticklabels()
+        self._check_text_labels(ylabels, [''] * len(ylabels))
+
+        tm._skip_if_no_scipy()
+        _skip_if_no_scipy_gaussian_kde()
+
_check_plot_works(self.ts.plot.kde) + _check_plot_works(self.ts.plot.density) + ax = self.ts.plot.kde(logy=True) + self._check_ax_scales(ax, yaxis='log') + xlabels = ax.get_xticklabels() + self._check_text_labels(xlabels, [''] * len(xlabels)) + ylabels = ax.get_yticklabels() + self._check_text_labels(ylabels, [''] * len(ylabels)) + + @slow + def test_kde_kwargs(self): + tm._skip_if_no_scipy() + _skip_if_no_scipy_gaussian_kde() + from numpy import linspace + _check_plot_works(self.ts.plot.kde, bw_method=.5, + ind=linspace(-100, 100, 20)) + _check_plot_works(self.ts.plot.density, bw_method=.5, + ind=linspace(-100, 100, 20)) + ax = self.ts.plot.kde(logy=True, bw_method=.5, + ind=linspace(-100, 100, 20)) + self._check_ax_scales(ax, yaxis='log') + self._check_text_labels(ax.yaxis.get_label(), 'Density') + + @slow + def test_kde_missing_vals(self): + tm._skip_if_no_scipy() + _skip_if_no_scipy_gaussian_kde() + s = Series(np.random.uniform(size=50)) + s[0] = np.nan + _check_plot_works(s.plot.kde) + + @slow + def test_hist_kwargs(self): + ax = self.ts.plot.hist(bins=5) + self.assertEqual(len(ax.patches), 5) + self._check_text_labels(ax.yaxis.get_label(), 'Frequency') + tm.close() + + if self.mpl_ge_1_3_1: + ax = self.ts.plot.hist(orientation='horizontal') + self._check_text_labels(ax.xaxis.get_label(), 'Frequency') + tm.close() + + ax = self.ts.plot.hist(align='left', stacked=True) + tm.close() + + @slow + def test_hist_kde_color(self): + ax = self.ts.plot.hist(logy=True, bins=10, color='b') + self._check_ax_scales(ax, yaxis='log') + self.assertEqual(len(ax.patches), 10) + self._check_colors(ax.patches, facecolors=['b'] * 10) + + tm._skip_if_no_scipy() + _skip_if_no_scipy_gaussian_kde() + ax = self.ts.plot.kde(logy=True, color='r') + self._check_ax_scales(ax, yaxis='log') + lines = ax.get_lines() + self.assertEqual(len(lines), 1) + self._check_colors(lines, ['r']) + + @slow + def test_boxplot_series(self): + ax = self.ts.plot.box(logy=True) + self._check_ax_scales(ax, yaxis='log') + xlabels = ax.get_xticklabels() + self._check_text_labels(xlabels, [self.ts.name]) + ylabels = ax.get_yticklabels() + self._check_text_labels(ylabels, [''] * len(ylabels)) + + @slow + def test_kind_both_ways(self): + s = Series(range(3)) + for kind in plotting._common_kinds + plotting._series_kinds: + if not _ok_for_gaussian_kde(kind): + continue + s.plot(kind=kind) + getattr(s.plot, kind)() + + @slow + def test_invalid_plot_data(self): + s = Series(list('abcd')) + for kind in plotting._common_kinds: + if not _ok_for_gaussian_kde(kind): + continue + with tm.assertRaises(TypeError): + s.plot(kind=kind) + + @slow + def test_valid_object_plot(self): + s = Series(lrange(10), dtype=object) + for kind in plotting._common_kinds: + if not _ok_for_gaussian_kde(kind): + continue + _check_plot_works(s.plot, kind=kind) + + def test_partially_invalid_plot_data(self): + s = Series(['a', 'b', 1.0, 2]) + for kind in plotting._common_kinds: + if not _ok_for_gaussian_kde(kind): + continue + with tm.assertRaises(TypeError): + s.plot(kind=kind) + + def test_invalid_kind(self): + s = Series([1, 2]) + with tm.assertRaises(ValueError): + s.plot(kind='aasdf') + + @slow + def test_dup_datetime_index_plot(self): + dr1 = date_range('1/1/2009', periods=4) + dr2 = date_range('1/2/2009', periods=4) + index = dr1.append(dr2) + values = randn(index.size) + s = Series(values, index=index) + _check_plot_works(s.plot) + + @slow + def test_errorbar_plot(self): + + s = Series(np.arange(10), name='x') + s_err = np.random.randn(10) + d_err = 
DataFrame(randn(10, 2), index=s.index, columns=['x', 'y']) + # test line and bar plots + kinds = ['line', 'bar'] + for kind in kinds: + ax = _check_plot_works(s.plot, yerr=Series(s_err), kind=kind) + self._check_has_errorbars(ax, xerr=0, yerr=1) + ax = _check_plot_works(s.plot, yerr=s_err, kind=kind) + self._check_has_errorbars(ax, xerr=0, yerr=1) + ax = _check_plot_works(s.plot, yerr=s_err.tolist(), kind=kind) + self._check_has_errorbars(ax, xerr=0, yerr=1) + ax = _check_plot_works(s.plot, yerr=d_err, kind=kind) + self._check_has_errorbars(ax, xerr=0, yerr=1) + ax = _check_plot_works(s.plot, xerr=0.2, yerr=0.2, kind=kind) + self._check_has_errorbars(ax, xerr=1, yerr=1) + + ax = _check_plot_works(s.plot, xerr=s_err) + self._check_has_errorbars(ax, xerr=1, yerr=0) + + # test time series plotting + ix = date_range('1/1/2000', '1/1/2001', freq='M') + ts = Series(np.arange(12), index=ix, name='x') + ts_err = Series(np.random.randn(12), index=ix) + td_err = DataFrame(randn(12, 2), index=ix, columns=['x', 'y']) + + ax = _check_plot_works(ts.plot, yerr=ts_err) + self._check_has_errorbars(ax, xerr=0, yerr=1) + ax = _check_plot_works(ts.plot, yerr=td_err) + self._check_has_errorbars(ax, xerr=0, yerr=1) + + # check incorrect lengths and types + with tm.assertRaises(ValueError): + s.plot(yerr=np.arange(11)) + + s_err = ['zzz'] * 10 + # in mpl 1.5+ this is a TypeError + with tm.assertRaises((ValueError, TypeError)): + s.plot(yerr=s_err) + + def test_table(self): + _check_plot_works(self.series.plot, table=True) + _check_plot_works(self.series.plot, table=self.series) + + @slow + def test_series_grid_settings(self): + # Make sure plot defaults to rcParams['axes.grid'] setting, GH 9792 + self._check_grid_settings(Series([1, 2, 3]), + plotting._series_kinds + + plotting._common_kinds) + + @slow + def test_standard_colors(self): + for c in ['r', 'red', 'green', '#FF0000']: + result = plotting._get_standard_colors(1, color=c) + self.assertEqual(result, [c]) + + result = plotting._get_standard_colors(1, color=[c]) + self.assertEqual(result, [c]) + + result = plotting._get_standard_colors(3, color=c) + self.assertEqual(result, [c] * 3) + + result = plotting._get_standard_colors(3, color=[c]) + self.assertEqual(result, [c] * 3) + + @slow + def test_standard_colors_all(self): + import matplotlib.colors as colors + + # multiple colors like mediumaquamarine + for c in colors.cnames: + result = plotting._get_standard_colors(num_colors=1, color=c) + self.assertEqual(result, [c]) + + result = plotting._get_standard_colors(num_colors=1, color=[c]) + self.assertEqual(result, [c]) + + result = plotting._get_standard_colors(num_colors=3, color=c) + self.assertEqual(result, [c] * 3) + + result = plotting._get_standard_colors(num_colors=3, color=[c]) + self.assertEqual(result, [c] * 3) + + # single letter colors like k + for c in colors.ColorConverter.colors: + result = plotting._get_standard_colors(num_colors=1, color=c) + self.assertEqual(result, [c]) + + result = plotting._get_standard_colors(num_colors=1, color=[c]) + self.assertEqual(result, [c]) + + result = plotting._get_standard_colors(num_colors=3, color=c) + self.assertEqual(result, [c] * 3) + + result = plotting._get_standard_colors(num_colors=3, color=[c]) + self.assertEqual(result, [c] * 3) + + def test_series_plot_color_kwargs(self): + # GH1890 + ax = Series(np.arange(12) + 1).plot(color='green') + self._check_colors(ax.get_lines(), linecolors=['green']) + + def test_time_series_plot_color_kwargs(self): + # #1890 + ax = Series(np.arange(12) + 1, 
index=date_range( + '1/1/2000', periods=12)).plot(color='green') + self._check_colors(ax.get_lines(), linecolors=['green']) + + def test_time_series_plot_color_with_empty_kwargs(self): + import matplotlib as mpl + + if self.mpl_ge_1_5_0: + def_colors = self._maybe_unpack_cycler(mpl.rcParams) + else: + def_colors = mpl.rcParams['axes.color_cycle'] + index = date_range('1/1/2000', periods=12) + s = Series(np.arange(1, 13), index=index) + + ncolors = 3 + + for i in range(ncolors): + ax = s.plot() + self._check_colors(ax.get_lines(), linecolors=def_colors[:ncolors]) + + def test_xticklabels(self): + # GH11529 + s = Series(np.arange(10), index=['P%02d' % i for i in range(10)]) + ax = s.plot(xticks=[0, 3, 5, 9]) + exp = ['P%02d' % i for i in [0, 3, 5, 9]] + self._check_text_labels(ax.get_xticklabels(), exp) + + def test_custom_business_day_freq(self): + # GH7222 + from pandas.tseries.offsets import CustomBusinessDay + s = Series(range(100, 121), index=pd.bdate_range( + start='2014-05-01', end='2014-06-01', + freq=CustomBusinessDay(holidays=['2014-05-26']))) + + _check_plot_works(s.plot) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 433f0f4bc67f5..24e3a0ff5f325 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -5,7 +5,6 @@ from distutils.version import LooseVersion import nose -import random from numpy import nan import numpy as np @@ -261,8 +260,29 @@ def test_kurt(self): self.assertEqual(0, s.kurt()) self.assertTrue((df.kurt() == 0).all()) + def test_describe(self): + s = Series([0, 1, 2, 3, 4], name='int_data') + result = s.describe() + expected = Series([5, 2, s.std(), 0, 1, 2, 3, 4], + name='int_data', + index=['count', 'mean', 'std', 'min', '25%', + '50%', '75%', 'max']) + self.assert_series_equal(result, expected) + + s = Series([True, True, False, False, False], name='bool_data') + result = s.describe() + expected = Series([5, 2, False, 3], name='bool_data', + index=['count', 'unique', 'top', 'freq']) + self.assert_series_equal(result, expected) + + s = Series(['a', 'a', 'b', 'c', 'd'], name='str_data') + result = s.describe() + expected = Series([5, 4, 'a', 2], name='str_data', + index=['count', 'unique', 'top', 'freq']) + self.assert_series_equal(result, expected) + def test_argsort(self): - self._check_accum_op('argsort') + self._check_accum_op('argsort', check_dtype=False) argsorted = self.ts.argsort() self.assertTrue(issubclass(argsorted.dtype.type, np.integer)) @@ -289,8 +309,10 @@ def test_argsort_stable(self): mexpected = np.argsort(s.values, kind='mergesort') qexpected = np.argsort(s.values, kind='quicksort') - self.assert_series_equal(mindexer, Series(mexpected)) - self.assert_series_equal(qindexer, Series(qexpected)) + self.assert_series_equal(mindexer, Series(mexpected), + check_dtype=False) + self.assert_series_equal(qindexer, Series(qexpected), + check_dtype=False) self.assertFalse(np.array_equal(qindexer, mindexer)) def test_cumsum(self): @@ -487,10 +509,11 @@ def testit(): except ImportError: pass - def _check_accum_op(self, name): + def _check_accum_op(self, name, check_dtype=True): func = getattr(np, name) self.assert_numpy_array_equal(func(self.ts).values, - func(np.array(self.ts))) + func(np.array(self.ts)), + check_dtype=check_dtype) # with missing values ts = self.ts.copy() @@ -499,7 +522,8 @@ def _check_accum_op(self, name): result = func(ts)[1::2] 
expected = func(np.array(ts.valid())) - self.assert_numpy_array_equal(result.values, expected) + self.assert_numpy_array_equal(result.values, expected, + check_dtype=False) def test_compress(self): cond = [True, False, True, False, False] @@ -598,39 +622,40 @@ def test_all_any_params(self): self.assertRaises(NotImplementedError, s.all, bool_only=True) def test_modulo(self): + with np.errstate(all='ignore'): + + # GH3590, modulo as ints + p = DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) + result = p['first'] % p['second'] + expected = Series(p['first'].values % p['second'].values, + dtype='float64') + expected.iloc[0:3] = np.nan + assert_series_equal(result, expected) - # GH3590, modulo as ints - p = DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) - result = p['first'] % p['second'] - expected = Series(p['first'].values % p['second'].values, - dtype='float64') - expected.iloc[0:3] = np.nan - assert_series_equal(result, expected) - - result = p['first'] % 0 - expected = Series(np.nan, index=p.index, name='first') - assert_series_equal(result, expected) + result = p['first'] % 0 + expected = Series(np.nan, index=p.index, name='first') + assert_series_equal(result, expected) - p = p.astype('float64') - result = p['first'] % p['second'] - expected = Series(p['first'].values % p['second'].values) - assert_series_equal(result, expected) + p = p.astype('float64') + result = p['first'] % p['second'] + expected = Series(p['first'].values % p['second'].values) + assert_series_equal(result, expected) - p = p.astype('float64') - result = p['first'] % p['second'] - result2 = p['second'] % p['first'] - self.assertFalse(np.array_equal(result, result2)) + p = p.astype('float64') + result = p['first'] % p['second'] + result2 = p['second'] % p['first'] + self.assertFalse(np.array_equal(result, result2)) - # GH 9144 - s = Series([0, 1]) + # GH 9144 + s = Series([0, 1]) - result = s % 0 - expected = Series([nan, nan]) - assert_series_equal(result, expected) + result = s % 0 + expected = Series([nan, nan]) + assert_series_equal(result, expected) - result = 0 % s - expected = Series([nan, 0.0]) - assert_series_equal(result, expected) + result = 0 % s + expected = Series([nan, 0.0]) + assert_series_equal(result, expected) def test_ops_consistency_on_empty(self): @@ -1360,13 +1385,13 @@ def test_searchsorted_numeric_dtypes_scalar(self): self.assertEqual(r, e) r = s.searchsorted([30]) - e = np.array([2], dtype=np.int64) + e = np.array([2], dtype=np.intp) tm.assert_numpy_array_equal(r, e) def test_searchsorted_numeric_dtypes_vector(self): s = Series([1, 2, 90, 1000, 3e9]) r = s.searchsorted([91, 2e6]) - e = np.array([3, 4], dtype=np.int64) + e = np.array([3, 4], dtype=np.intp) tm.assert_numpy_array_equal(r, e) def test_search_sorted_datetime64_scalar(self): @@ -1380,14 +1405,14 @@ def test_search_sorted_datetime64_list(self): s = Series(pd.date_range('20120101', periods=10, freq='2D')) v = [pd.Timestamp('20120102'), pd.Timestamp('20120104')] r = s.searchsorted(v) - e = np.array([1, 2], dtype=np.int64) + e = np.array([1, 2], dtype=np.intp) tm.assert_numpy_array_equal(r, e) def test_searchsorted_sorter(self): # GH8490 s = Series([3, 1, 2]) r = s.searchsorted([0, 3], sorter=np.argsort(s)) - e = np.array([0, 2], dtype=np.int64) + e = np.array([0, 2], dtype=np.intp) tm.assert_numpy_array_equal(r, e) def test_is_unique(self): @@ -1414,141 +1439,6 @@ def test_is_monotonic(self): self.assertFalse(s.is_monotonic) self.assertTrue(s.is_monotonic_decreasing) - def test_sort_values(self): - - ts = 
self.ts.copy() - - # 9816 deprecated - with tm.assert_produces_warning(FutureWarning): - ts.sort() - - self.assert_series_equal(ts, self.ts.sort_values()) - self.assert_index_equal(ts.index, self.ts.sort_values().index) - - ts.sort_values(ascending=False, inplace=True) - self.assert_series_equal(ts, self.ts.sort_values(ascending=False)) - self.assert_index_equal(ts.index, - self.ts.sort_values(ascending=False).index) - - # GH 5856/5853 - # Series.sort_values operating on a view - df = DataFrame(np.random.randn(10, 4)) - s = df.iloc[:, 0] - - def f(): - s.sort_values(inplace=True) - - self.assertRaises(ValueError, f) - - # test order/sort inplace - # GH6859 - ts1 = self.ts.copy() - ts1.sort_values(ascending=False, inplace=True) - ts2 = self.ts.copy() - ts2.sort_values(ascending=False, inplace=True) - assert_series_equal(ts1, ts2) - - ts1 = self.ts.copy() - ts1 = ts1.sort_values(ascending=False, inplace=False) - ts2 = self.ts.copy() - ts2 = ts.sort_values(ascending=False) - assert_series_equal(ts1, ts2) - - def test_sort_index(self): - rindex = list(self.ts.index) - random.shuffle(rindex) - - random_order = self.ts.reindex(rindex) - sorted_series = random_order.sort_index() - assert_series_equal(sorted_series, self.ts) - - # descending - sorted_series = random_order.sort_index(ascending=False) - assert_series_equal(sorted_series, - self.ts.reindex(self.ts.index[::-1])) - - def test_sort_index_inplace(self): - - # For #11402 - rindex = list(self.ts.index) - random.shuffle(rindex) - - # descending - random_order = self.ts.reindex(rindex) - result = random_order.sort_index(ascending=False, inplace=True) - self.assertIs(result, None, - msg='sort_index() inplace should return None') - assert_series_equal(random_order, self.ts.reindex(self.ts.index[::-1])) - - # ascending - random_order = self.ts.reindex(rindex) - result = random_order.sort_index(ascending=True, inplace=True) - self.assertIs(result, None, - msg='sort_index() inplace should return None') - assert_series_equal(random_order, self.ts) - - def test_sort_API(self): - - # API for 9816 - - # sortlevel - mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC')) - s = Series([1, 2], mi) - backwards = s.iloc[[1, 0]] - - res = s.sort_index(level='A') - assert_series_equal(backwards, res) - - # sort_index - rindex = list(self.ts.index) - random.shuffle(rindex) - - random_order = self.ts.reindex(rindex) - sorted_series = random_order.sort_index(level=0) - assert_series_equal(sorted_series, self.ts) - - # compat on axis - sorted_series = random_order.sort_index(axis=0) - assert_series_equal(sorted_series, self.ts) - - self.assertRaises(ValueError, lambda: random_order.sort_values(axis=1)) - - sorted_series = random_order.sort_index(level=0, axis=0) - assert_series_equal(sorted_series, self.ts) - - self.assertRaises(ValueError, - lambda: random_order.sort_index(level=0, axis=1)) - - def test_order(self): - - # 9816 deprecated - with tm.assert_produces_warning(FutureWarning): - self.ts.order() - - ts = self.ts.copy() - ts[:5] = np.NaN - vals = ts.values - - result = ts.sort_values() - self.assertTrue(np.isnan(result[-5:]).all()) - self.assert_numpy_array_equal(result[:-5].values, np.sort(vals[5:])) - - result = ts.sort_values(na_position='first') - self.assertTrue(np.isnan(result[:5]).all()) - self.assert_numpy_array_equal(result[5:].values, np.sort(vals[5:])) - - # something object-type - ser = Series(['A', 'B'], [1, 2]) - # no failure - ser.sort_values() - - # ascending=False - ordered = ts.sort_values(ascending=False) - expected = 
np.sort(ts.valid().values)[::-1] - assert_almost_equal(expected, ordered.valid().values) - ordered = ts.sort_values(ascending=False, na_position='first') - assert_almost_equal(expected, ordered.valid().values) - def test_nsmallest_nlargest(self): # float, int, datetime64 (use i8), timedelts64 (same), # object that are numbers, object that are strings @@ -1686,49 +1576,63 @@ def test_shift_categorical(self): assert_index_equal(s.values.categories, sp1.values.categories) assert_index_equal(s.values.categories, sn2.values.categories) + def test_reshape_deprecate(self): + x = Series(np.random.random(10), name='x') + tm.assert_produces_warning(FutureWarning, x.reshape, x.shape) + def test_reshape_non_2d(self): - # GH 4554 - x = Series(np.random.random(201), name='x') - self.assertTrue(x.reshape(x.shape, ) is x) + # see gh-4554 + with tm.assert_produces_warning(FutureWarning): + x = Series(np.random.random(201), name='x') + self.assertTrue(x.reshape(x.shape, ) is x) - # GH 2719 - a = Series([1, 2, 3, 4]) - result = a.reshape(2, 2) - expected = a.values.reshape(2, 2) - tm.assert_numpy_array_equal(result, expected) - self.assertIsInstance(result, type(expected)) + # see gh-2719 + with tm.assert_produces_warning(FutureWarning): + a = Series([1, 2, 3, 4]) + result = a.reshape(2, 2) + expected = a.values.reshape(2, 2) + tm.assert_numpy_array_equal(result, expected) + self.assertIsInstance(result, type(expected)) def test_reshape_2d_return_array(self): x = Series(np.random.random(201), name='x') - result = x.reshape((-1, 1)) - self.assertNotIsInstance(result, Series) - result2 = np.reshape(x, (-1, 1)) - self.assertNotIsInstance(result2, Series) + with tm.assert_produces_warning(FutureWarning): + result = x.reshape((-1, 1)) + self.assertNotIsInstance(result, Series) + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result2 = np.reshape(x, (-1, 1)) + self.assertNotIsInstance(result2, Series) - result = x[:, None] - expected = x.reshape((-1, 1)) - assert_almost_equal(result, expected) + with tm.assert_produces_warning(FutureWarning): + result = x[:, None] + expected = x.reshape((-1, 1)) + assert_almost_equal(result, expected) def test_reshape_bad_kwarg(self): a = Series([1, 2, 3, 4]) - msg = "'foo' is an invalid keyword argument for this function" - tm.assertRaisesRegexp(TypeError, msg, a.reshape, (2, 2), foo=2) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + msg = "'foo' is an invalid keyword argument for this function" + tm.assertRaisesRegexp(TypeError, msg, a.reshape, (2, 2), foo=2) - msg = "reshape\(\) got an unexpected keyword argument 'foo'" - tm.assertRaisesRegexp(TypeError, msg, a.reshape, a.shape, foo=2) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + msg = "reshape\(\) got an unexpected keyword argument 'foo'" + tm.assertRaisesRegexp(TypeError, msg, a.reshape, a.shape, foo=2) def test_numpy_reshape(self): a = Series([1, 2, 3, 4]) - result = np.reshape(a, (2, 2)) - expected = a.values.reshape(2, 2) - tm.assert_numpy_array_equal(result, expected) - self.assertIsInstance(result, type(expected)) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = np.reshape(a, (2, 2)) + expected = a.values.reshape(2, 2) + tm.assert_numpy_array_equal(result, expected) + self.assertIsInstance(result, type(expected)) - result = np.reshape(a, a.shape) - tm.assert_series_equal(result, a) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = np.reshape(a, a.shape) + 
tm.assert_series_equal(result, a) def test_unstack(self): from numpy import nan diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py index 26fc80c3ef988..8d7676bef4d72 100644 --- a/pandas/tests/series/test_apply.py +++ b/pandas/tests/series/test_apply.py @@ -18,17 +18,18 @@ class TestSeriesApply(TestData, tm.TestCase): _multiprocess_can_split_ = True def test_apply(self): - assert_series_equal(self.ts.apply(np.sqrt), np.sqrt(self.ts)) - - # elementwise-apply - import math - assert_series_equal(self.ts.apply(math.exp), np.exp(self.ts)) - - # how to handle Series result, #2316 - result = self.ts.apply(lambda x: Series( - [x, x ** 2], index=['x', 'x^2'])) - expected = DataFrame({'x': self.ts, 'x^2': self.ts ** 2}) - tm.assert_frame_equal(result, expected) + with np.errstate(all='ignore'): + assert_series_equal(self.ts.apply(np.sqrt), np.sqrt(self.ts)) + + # elementwise-apply + import math + assert_series_equal(self.ts.apply(math.exp), np.exp(self.ts)) + + # how to handle Series result, #2316 + result = self.ts.apply(lambda x: Series( + [x, x ** 2], index=['x', 'x^2'])) + expected = DataFrame({'x': self.ts, 'x^2': self.ts ** 2}) + tm.assert_frame_equal(result, expected) # empty series s = Series(dtype=object, name='foo', index=pd.Index([], name='bar')) diff --git a/pandas/tests/series/test_asof.py b/pandas/tests/series/test_asof.py new file mode 100644 index 0000000000000..e2092feab9004 --- /dev/null +++ b/pandas/tests/series/test_asof.py @@ -0,0 +1,158 @@ +# coding=utf-8 + +import nose + +import numpy as np + +from pandas import (offsets, Series, notnull, + isnull, date_range, Timestamp) + +import pandas.util.testing as tm + +from .common import TestData + + +class TestSeriesAsof(TestData, tm.TestCase): + _multiprocess_can_split_ = True + + def test_basic(self): + + # array or list or dates + N = 50 + rng = date_range('1/1/1990', periods=N, freq='53s') + ts = Series(np.random.randn(N), index=rng) + ts[15:30] = np.nan + dates = date_range('1/1/1990', periods=N * 3, freq='25s') + + result = ts.asof(dates) + self.assertTrue(notnull(result).all()) + lb = ts.index[14] + ub = ts.index[30] + + result = ts.asof(list(dates)) + self.assertTrue(notnull(result).all()) + lb = ts.index[14] + ub = ts.index[30] + + mask = (result.index >= lb) & (result.index < ub) + rs = result[mask] + self.assertTrue((rs == ts[lb]).all()) + + val = result[result.index[result.index >= ub][0]] + self.assertEqual(ts[ub], val) + + def test_scalar(self): + + N = 30 + rng = date_range('1/1/1990', periods=N, freq='53s') + ts = Series(np.arange(N), index=rng) + ts[5:10] = np.NaN + ts[15:20] = np.NaN + + val1 = ts.asof(ts.index[7]) + val2 = ts.asof(ts.index[19]) + + self.assertEqual(val1, ts[4]) + self.assertEqual(val2, ts[14]) + + # accepts strings + val1 = ts.asof(str(ts.index[7])) + self.assertEqual(val1, ts[4]) + + # in there + result = ts.asof(ts.index[3]) + self.assertEqual(result, ts[3]) + + # no as of value + d = ts.index[0] - offsets.BDay() + self.assertTrue(np.isnan(ts.asof(d))) + + def test_with_nan(self): + # basic asof test + rng = date_range('1/1/2000', '1/2/2000', freq='4h') + s = Series(np.arange(len(rng)), index=rng) + r = s.resample('2h').mean() + + result = r.asof(r.index) + expected = Series([0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6.], + index=date_range('1/1/2000', '1/2/2000', freq='2h')) + tm.assert_series_equal(result, expected) + + r.iloc[3:5] = np.nan + result = r.asof(r.index) + expected = Series([0, 0, 1, 1, 1, 1, 3, 3, 4, 4, 5, 5, 6.], + index=date_range('1/1/2000', '1/2/2000', 
freq='2h')) + tm.assert_series_equal(result, expected) + + r.iloc[-3:] = np.nan + result = r.asof(r.index) + expected = Series([0, 0, 1, 1, 1, 1, 3, 3, 4, 4, 4, 4, 4.], + index=date_range('1/1/2000', '1/2/2000', freq='2h')) + tm.assert_series_equal(result, expected) + + def test_periodindex(self): + from pandas import period_range, PeriodIndex + # array or list or dates + N = 50 + rng = period_range('1/1/1990', periods=N, freq='H') + ts = Series(np.random.randn(N), index=rng) + ts[15:30] = np.nan + dates = date_range('1/1/1990', periods=N * 3, freq='37min') + + result = ts.asof(dates) + self.assertTrue(notnull(result).all()) + lb = ts.index[14] + ub = ts.index[30] + + result = ts.asof(list(dates)) + self.assertTrue(notnull(result).all()) + lb = ts.index[14] + ub = ts.index[30] + + pix = PeriodIndex(result.index.values, freq='H') + mask = (pix >= lb) & (pix < ub) + rs = result[mask] + self.assertTrue((rs == ts[lb]).all()) + + ts[5:10] = np.nan + ts[15:20] = np.nan + + val1 = ts.asof(ts.index[7]) + val2 = ts.asof(ts.index[19]) + + self.assertEqual(val1, ts[4]) + self.assertEqual(val2, ts[14]) + + # accepts strings + val1 = ts.asof(str(ts.index[7])) + self.assertEqual(val1, ts[4]) + + # in there + self.assertEqual(ts.asof(ts.index[3]), ts[3]) + + # no as of value + d = ts.index[0].to_timestamp() - offsets.BDay() + self.assertTrue(isnull(ts.asof(d))) + + def test_errors(self): + + s = Series([1, 2, 3], + index=[Timestamp('20130101'), + Timestamp('20130103'), + Timestamp('20130102')]) + + # non-monotonic + self.assertFalse(s.index.is_monotonic) + with self.assertRaises(ValueError): + s.asof(s.index[0]) + + # subset with Series + N = 10 + rng = date_range('1/1/1990', periods=N, freq='53s') + s = Series(np.random.randn(N), index=rng) + with self.assertRaises(ValueError): + s.asof(s.index[0], subset='foo') + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/series/test_combine_concat.py b/pandas/tests/series/test_combine_concat.py index eb560d4a17055..23261c2ef79e2 100644 --- a/pandas/tests/series/test_combine_concat.py +++ b/pandas/tests/series/test_combine_concat.py @@ -39,6 +39,27 @@ def test_append_many(self): result = pieces[0].append(pieces[1:]) assert_series_equal(result, self.ts) + def test_append_duplicates(self): + # GH 13677 + s1 = pd.Series([1, 2, 3]) + s2 = pd.Series([4, 5, 6]) + exp = pd.Series([1, 2, 3, 4, 5, 6], index=[0, 1, 2, 0, 1, 2]) + tm.assert_series_equal(s1.append(s2), exp) + tm.assert_series_equal(pd.concat([s1, s2]), exp) + + # the result must have RangeIndex + exp = pd.Series([1, 2, 3, 4, 5, 6]) + tm.assert_series_equal(s1.append(s2, ignore_index=True), + exp, check_index_type=True) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), + exp, check_index_type=True) + + msg = 'Indexes have overlapping values:' + with tm.assertRaisesRegexp(ValueError, msg): + s1.append(s2, verify_integrity=True) + with tm.assertRaisesRegexp(ValueError, msg): + pd.concat([s1, s2], verify_integrity=True) + def test_combine_first(self): values = tm.makeIntIndex(20).values.astype(float) series = Series(values, index=tm.makeIntIndex(20)) @@ -164,9 +185,9 @@ def test_concat_empty_series_dtypes(self): 'category') self.assertEqual(pd.concat([Series(dtype='category'), Series(dtype='float64')]).dtype, - np.object_) + 'float64') self.assertEqual(pd.concat([Series(dtype='category'), - Series(dtype='object')]).dtype, 'category') + Series(dtype='object')]).dtype, 'object') # sparse result = 
pd.concat([Series(dtype='float64').to_sparse(), Series( diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index a80a3af56b18f..ed7b0fda19cb7 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -8,10 +8,11 @@ import numpy.ma as ma import pandas as pd +from pandas.types.common import is_categorical_dtype, is_datetime64tz_dtype from pandas import Index, Series, isnull, date_range, period_range from pandas.core.index import MultiIndex from pandas.tseries.index import Timestamp, DatetimeIndex -import pandas.core.common as com + import pandas.lib as lib from pandas.compat import lrange, range, zip, OrderedDict, long @@ -108,6 +109,17 @@ def test_constructor_iterator(self): result = Series(range(10), dtype='int64') assert_series_equal(result, expected) + def test_constructor_list_like(self): + + # make sure that we are coercing different + # list-likes to standard dtypes and not + # platform specific + expected = Series([1, 2, 3], dtype='int64') + for obj in [[1, 2, 3], (1, 2, 3), + np.array([1, 2, 3], dtype='int64')]: + result = Series(obj, index=[0, 1, 2]) + assert_series_equal(result, expected) + def test_constructor_generator(self): gen = (i for i in range(10)) @@ -144,11 +156,11 @@ def test_constructor_categorical(self): ValueError, lambda: Series(pd.Categorical([1, 2, 3]), dtype='int64')) cat = Series(pd.Categorical([1, 2, 3]), dtype='category') - self.assertTrue(com.is_categorical_dtype(cat)) - self.assertTrue(com.is_categorical_dtype(cat.dtype)) + self.assertTrue(is_categorical_dtype(cat)) + self.assertTrue(is_categorical_dtype(cat.dtype)) s = Series([1, 2, 3], dtype='category') - self.assertTrue(com.is_categorical_dtype(s)) - self.assertTrue(com.is_categorical_dtype(s.dtype)) + self.assertTrue(is_categorical_dtype(s)) + self.assertTrue(is_categorical_dtype(s.dtype)) def test_constructor_maskedarray(self): data = ma.masked_all((3, ), dtype=float) @@ -252,6 +264,24 @@ def test_constructor_pass_none(self): expected = Series(index=Index([None])) assert_series_equal(s, expected) + def test_constructor_pass_nan_nat(self): + # GH 13467 + exp = Series([np.nan, np.nan], dtype=np.float64) + self.assertEqual(exp.dtype, np.float64) + tm.assert_series_equal(Series([np.nan, np.nan]), exp) + tm.assert_series_equal(Series(np.array([np.nan, np.nan])), exp) + + exp = Series([pd.NaT, pd.NaT]) + self.assertEqual(exp.dtype, 'datetime64[ns]') + tm.assert_series_equal(Series([pd.NaT, pd.NaT]), exp) + tm.assert_series_equal(Series(np.array([pd.NaT, pd.NaT])), exp) + + tm.assert_series_equal(Series([pd.NaT, np.nan]), exp) + tm.assert_series_equal(Series(np.array([pd.NaT, np.nan])), exp) + + tm.assert_series_equal(Series([np.nan, pd.NaT]), exp) + tm.assert_series_equal(Series(np.array([np.nan, pd.NaT])), exp) + def test_constructor_cast(self): self.assertRaises(ValueError, Series, ['a', 'b', 'c'], dtype=float) @@ -351,13 +381,21 @@ def test_constructor_dtype_datetime64(self): # coerce datetime64 non-ns properly dates = date_range('01-Jan-2015', '01-Dec-2015', freq='M') values2 = dates.view(np.ndarray).astype('datetime64[ns]') - expected = Series(values2, dates) + expected = Series(values2, index=dates) for dtype in ['s', 'D', 'ms', 'us', 'ns']: values1 = dates.view(np.ndarray).astype('M8[{0}]'.format(dtype)) result = Series(values1, dates) assert_series_equal(result, expected) + # GH 13876 + # coerce to non-ns to object properly + expected = Series(values2, index=dates, dtype=object) + for dtype in ['s', 'D', 'ms', 
'us', 'ns']: + values1 = dates.view(np.ndarray).astype('M8[{0}]'.format(dtype)) + result = Series(values1, index=dates, dtype=object) + assert_series_equal(result, expected) + # leave datetime.date alone dates2 = np.array([d.date() for d in dates.to_pydatetime()], dtype=object) @@ -411,7 +449,7 @@ def test_constructor_with_datetime_tz(self): s = Series(dr) self.assertTrue(s.dtype.name == 'datetime64[ns, US/Eastern]') self.assertTrue(s.dtype == 'datetime64[ns, US/Eastern]') - self.assertTrue(com.is_datetime64tz_dtype(s.dtype)) + self.assertTrue(is_datetime64tz_dtype(s.dtype)) self.assertTrue('datetime64[ns, US/Eastern]' in str(s)) # export @@ -426,10 +464,10 @@ def test_constructor_with_datetime_tz(self): # indexing result = s.iloc[0] self.assertEqual(result, Timestamp('2013-01-01 00:00:00-0500', - tz='US/Eastern', offset='D')) + tz='US/Eastern', freq='D')) result = s[0] self.assertEqual(result, Timestamp('2013-01-01 00:00:00-0500', - tz='US/Eastern', offset='D')) + tz='US/Eastern', freq='D')) result = s[Series([True, True, False], index=s.index)] assert_series_equal(result, s[0:2]) @@ -688,8 +726,9 @@ def test_constructor_dtype_timedelta64(self): td = Series([np.timedelta64(300000000), pd.NaT]) self.assertEqual(td.dtype, 'timedelta64[ns]') + # because iNaT is int, not coerced to timedelta td = Series([np.timedelta64(300000000), tslib.iNaT]) - self.assertEqual(td.dtype, 'timedelta64[ns]') + self.assertEqual(td.dtype, 'object') td = Series([np.timedelta64(300000000), np.nan]) self.assertEqual(td.dtype, 'timedelta64[ns]') diff --git a/pandas/tests/series/test_datetime_values.py b/pandas/tests/series/test_datetime_values.py index 6e82f81f901a9..8f2ab0ed28839 100644 --- a/pandas/tests/series/test_datetime_values.py +++ b/pandas/tests/series/test_datetime_values.py @@ -1,11 +1,12 @@ # coding=utf-8 # pylint: disable-msg=E1101,W0612 -from datetime import datetime +from datetime import datetime, date import numpy as np import pandas as pd +from pandas.types.common import is_integer_dtype, is_list_like from pandas import (Index, Series, DataFrame, bdate_range, date_range, period_range, timedelta_range) from pandas.tseries.period import PeriodIndex @@ -31,7 +32,7 @@ def test_dt_namespace_accessor(self): ok_for_base = ['year', 'month', 'day', 'hour', 'minute', 'second', 'weekofyear', 'week', 'dayofweek', 'weekday', 'dayofyear', 'quarter', 'freq', 'days_in_month', - 'daysinmonth'] + 'daysinmonth', 'is_leap_year'] ok_for_period = ok_for_base + ['qyear', 'start_time', 'end_time'] ok_for_period_methods = ['strftime', 'to_timestamp', 'asfreq'] ok_for_dt = ok_for_base + ['date', 'time', 'microsecond', 'nanosecond', @@ -49,16 +50,16 @@ def test_dt_namespace_accessor(self): def get_expected(s, name): result = getattr(Index(s._values), prop) if isinstance(result, np.ndarray): - if com.is_integer_dtype(result): + if is_integer_dtype(result): result = result.astype('int64') - elif not com.is_list_like(result): + elif not is_list_like(result): return result return Series(result, index=s.index, name=s.name) def compare(s, name): a = getattr(s.dt, prop) b = get_expected(s, prop) - if not (com.is_list_like(a) and com.is_list_like(b)): + if not (is_list_like(a) and is_list_like(b)): self.assertEqual(a, b) else: tm.assert_series_equal(a, b) @@ -409,3 +410,15 @@ def test_between(self): result = s[s.between(s[3], s[17], inclusive=False)] expected = s[5:16].dropna() assert_series_equal(result, expected) + + def test_date_tz(self): + # GH11757 + rng = pd.DatetimeIndex(['2014-04-04 23:56', + '2014-07-18 21:24', + 
'2015-11-22 22:14'], tz="US/Eastern") + s = Series(rng) + expected = Series([date(2014, 4, 4), + date(2014, 7, 18), + date(2015, 11, 22)]) + assert_series_equal(s.dt.date, expected) + assert_series_equal(s.apply(lambda x: x.date()), expected) diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 6864eac603ded..9a406dfa10c35 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -133,6 +133,22 @@ def test_astype_unicode(self): reload(sys) # noqa sys.setdefaultencoding(former_encoding) + def test_astype_dict(self): + # GH7271 + s = Series(range(0, 10, 2), name='abc') + + result = s.astype({'abc': str}) + expected = Series(['0', '2', '4', '6', '8'], name='abc') + assert_series_equal(result, expected) + + result = s.astype({'abc': 'float64'}) + expected = Series([0.0, 2.0, 4.0, 6.0, 8.0], dtype='float64', + name='abc') + assert_series_equal(result, expected) + + self.assertRaises(KeyError, s.astype, {'abc': str, 'def': str}) + self.assertRaises(KeyError, s.astype, {0: str}) + def test_complexx(self): # GH4819 # complex access for ndarray compat diff --git a/pandas/tests/series/test_indexing.py b/pandas/tests/series/test_indexing.py index d01ac3e1aef42..5eef06bacfcb0 100644 --- a/pandas/tests/series/test_indexing.py +++ b/pandas/tests/series/test_indexing.py @@ -7,22 +7,20 @@ import numpy as np import pandas as pd +from pandas.types.common import is_integer, is_scalar from pandas import Index, Series, DataFrame, isnull, date_range from pandas.core.index import MultiIndex from pandas.core.indexing import IndexingError from pandas.tseries.index import Timestamp +from pandas.tseries.offsets import BDay from pandas.tseries.tdi import Timedelta -import pandas.core.common as com - -import pandas.core.datetools as datetools -import pandas.lib as lib from pandas.compat import lrange, range from pandas import compat from pandas.util.testing import assert_series_equal, assert_almost_equal import pandas.util.testing as tm -from .common import TestData +from pandas.tests.series.common import TestData JOIN_TYPES = ['inner', 'outer', 'left', 'right'] @@ -155,7 +153,7 @@ def test_getitem_get(self): self.assertEqual(self.series[5], self.series.get(self.series.index[5])) # missing - d = self.ts.index[0] - datetools.bday + d = self.ts.index[0] - BDay() self.assertRaises(KeyError, self.ts.__getitem__, d) # None @@ -323,7 +321,7 @@ def test_getitem_boolean_object(self): def test_getitem_setitem_boolean_corner(self): ts = self.ts - mask_shifted = ts.shift(1, freq=datetools.bday) > ts.median() + mask_shifted = ts.shift(1, freq=BDay()) > ts.median() # these used to raise...?? 
@@ -375,7 +373,7 @@ def test_getitem_ambiguous_keyerror(self): def test_getitem_unordered_dup(self): obj = Series(lrange(5), index=['c', 'a', 'a', 'b', 'b']) - self.assertTrue(lib.isscalar(obj['c'])) + self.assertTrue(is_scalar(obj['c'])) self.assertEqual(obj['c'], 0) def test_getitem_dups_with_missing(self): @@ -441,6 +439,16 @@ def test_setitem_callable(self): s[lambda x: 'A'] = -1 tm.assert_series_equal(s, pd.Series([-1, 2, 3, 4], index=list('ABCD'))) + def test_setitem_other_callable(self): + # GH 13299 + inc = lambda x: x + 1 + + s = pd.Series([1, 2, -1, 4]) + s[s < 0] = inc + + expected = pd.Series([1, 2, inc, 4]) + tm.assert_series_equal(s, expected) + def test_slice(self): numSlice = self.series[10:20] numSliceEnd = self.series[-10:] @@ -1164,23 +1172,23 @@ def test_where_numeric_with_string(self): s = pd.Series([1, 2, 3]) w = s.where(s > 1, 'X') - self.assertFalse(com.is_integer(w[0])) - self.assertTrue(com.is_integer(w[1])) - self.assertTrue(com.is_integer(w[2])) + self.assertFalse(is_integer(w[0])) + self.assertTrue(is_integer(w[1])) + self.assertTrue(is_integer(w[2])) self.assertTrue(isinstance(w[0], str)) self.assertTrue(w.dtype == 'object') w = s.where(s > 1, ['X', 'Y', 'Z']) - self.assertFalse(com.is_integer(w[0])) - self.assertTrue(com.is_integer(w[1])) - self.assertTrue(com.is_integer(w[2])) + self.assertFalse(is_integer(w[0])) + self.assertTrue(is_integer(w[1])) + self.assertTrue(is_integer(w[2])) self.assertTrue(isinstance(w[0], str)) self.assertTrue(w.dtype == 'object') w = s.where(s > 1, np.array(['X', 'Y', 'Z'])) - self.assertFalse(com.is_integer(w[0])) - self.assertTrue(com.is_integer(w[1])) - self.assertTrue(com.is_integer(w[2])) + self.assertFalse(is_integer(w[0])) + self.assertTrue(is_integer(w[1])) + self.assertTrue(is_integer(w[2])) self.assertTrue(isinstance(w[0], str)) self.assertTrue(w.dtype == 'object') @@ -1316,6 +1324,13 @@ def test_timedelta_assignment(self): s.loc['A'] = timedelta(1) tm.assert_series_equal(s, expected) + # GH 14155 + s = Series(10 * [np.timedelta64(10, 'm')]) + s.loc[[1, 2, 3]] = np.timedelta64(20, 'm') + expected = pd.Series(10 * [np.timedelta64(10, 'm')]) + expected.loc[[1, 2, 3]] = pd.Timedelta(np.timedelta64(20, 'm')) + tm.assert_series_equal(s, expected) + def test_underlying_data_conversion(self): # GH 4080 @@ -1848,3 +1863,8 @@ def test_multilevel_preserve_name(self): result2 = s.ix['foo'] self.assertEqual(result.name, s.name) self.assertEqual(result2.name, s.name) + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py index f89501d39f014..48528dc54adbd 100644 --- a/pandas/tests/series/test_io.py +++ b/pandas/tests/series/test_io.py @@ -64,7 +64,8 @@ def test_to_csv(self): with ensure_clean() as path: self.ts.to_csv(path) - lines = io.open(path, newline=None).readlines() + with io.open(path, newline=None) as f: + lines = f.readlines() assert (lines[1] != '\n') self.ts.to_csv(path, index=False) diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index 3588faa8b42f1..197311868b768 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -18,7 +18,8 @@ from pandas.compat import range, zip from pandas import compat -from pandas.util.testing import assert_series_equal, assert_almost_equal +from pandas.util.testing import (assert_series_equal, assert_almost_equal, + assert_frame_equal) import 
pandas.util.testing as tm from .common import TestData @@ -34,7 +35,8 @@ def test_comparisons(self): left[:3] = np.nan result = nanops.nangt(left, right) - expected = (left > right).astype('O') + with np.errstate(invalid='ignore'): + expected = (left > right).astype('O') expected[:3] = np.nan assert_almost_equal(result, expected) @@ -43,8 +45,9 @@ def test_comparisons(self): s2 = Series([False, True, False]) # it works! - s == s2 - s2 == s + exp = Series([False, False, False]) + assert_series_equal(s == s2, exp) + assert_series_equal(s2 == s, exp) def test_op_method(self): def check(series, other, check_reverse=False): @@ -62,12 +65,12 @@ def check(series, other, check_reverse=False): result = op(series, other) expected = alt(series, other) - tm.assert_almost_equal(result, expected) + assert_almost_equal(result, expected) if check_reverse: rop = getattr(Series, "r" + opname) result = rop(series, other) expected = alt(other, series) - tm.assert_almost_equal(result, expected) + assert_almost_equal(result, expected) check(self.ts, self.ts * 2) check(self.ts, self.ts[::2]) @@ -81,62 +84,63 @@ def test_invert(self): assert_series_equal(-(self.series < 0), ~(self.series < 0)) def test_div(self): + with np.errstate(all='ignore'): + # no longer do integer div for any ops, but deal with the 0's + p = DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) + result = p['first'] / p['second'] + expected = Series( + p['first'].values.astype(float) / p['second'].values, + dtype='float64') + expected.iloc[0:3] = np.inf + assert_series_equal(result, expected) - # no longer do integer div for any ops, but deal with the 0's - p = DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) - result = p['first'] / p['second'] - expected = Series(p['first'].values.astype(float) / p['second'].values, - dtype='float64') - expected.iloc[0:3] = np.inf - assert_series_equal(result, expected) - - result = p['first'] / 0 - expected = Series(np.inf, index=p.index, name='first') - assert_series_equal(result, expected) - - p = p.astype('float64') - result = p['first'] / p['second'] - expected = Series(p['first'].values / p['second'].values) - assert_series_equal(result, expected) + result = p['first'] / 0 + expected = Series(np.inf, index=p.index, name='first') + assert_series_equal(result, expected) - p = DataFrame({'first': [3, 4, 5, 8], 'second': [1, 1, 1, 1]}) - result = p['first'] / p['second'] - assert_series_equal(result, p['first'].astype('float64'), - check_names=False) - self.assertTrue(result.name is None) - self.assertFalse(np.array_equal(result, p['second'] / p['first'])) + p = p.astype('float64') + result = p['first'] / p['second'] + expected = Series(p['first'].values / p['second'].values) + assert_series_equal(result, expected) - # inf signing - s = Series([np.nan, 1., -1.]) - result = s / 0 - expected = Series([np.nan, np.inf, -np.inf]) - assert_series_equal(result, expected) + p = DataFrame({'first': [3, 4, 5, 8], 'second': [1, 1, 1, 1]}) + result = p['first'] / p['second'] + assert_series_equal(result, p['first'].astype('float64'), + check_names=False) + self.assertTrue(result.name is None) + self.assertFalse(np.array_equal(result, p['second'] / p['first'])) + + # inf signing + s = Series([np.nan, 1., -1.]) + result = s / 0 + expected = Series([np.nan, np.inf, -np.inf]) + assert_series_equal(result, expected) - # float/integer issue - # GH 7785 - p = DataFrame({'first': (1, 0), 'second': (-0.01, -0.02)}) - expected = Series([-0.01, -np.inf]) + # float/integer issue + # GH 7785 + p = 
DataFrame({'first': (1, 0), 'second': (-0.01, -0.02)}) + expected = Series([-0.01, -np.inf]) - result = p['second'].div(p['first']) - assert_series_equal(result, expected, check_names=False) + result = p['second'].div(p['first']) + assert_series_equal(result, expected, check_names=False) - result = p['second'] / p['first'] - assert_series_equal(result, expected) + result = p['second'] / p['first'] + assert_series_equal(result, expected) - # GH 9144 - s = Series([-1, 0, 1]) + # GH 9144 + s = Series([-1, 0, 1]) - result = 0 / s - expected = Series([0.0, nan, 0.0]) - assert_series_equal(result, expected) + result = 0 / s + expected = Series([0.0, nan, 0.0]) + assert_series_equal(result, expected) - result = s / 0 - expected = Series([-inf, nan, inf]) - assert_series_equal(result, expected) + result = s / 0 + expected = Series([-inf, nan, inf]) + assert_series_equal(result, expected) - result = s // 0 - expected = Series([-inf, nan, inf]) - assert_series_equal(result, expected) + result = s // 0 + expected = Series([-inf, nan, inf]) + assert_series_equal(result, expected) def test_operators(self): def _check_op(series, other, op, pos_only=False, @@ -146,8 +150,8 @@ def _check_op(series, other, op, pos_only=False, cython_or_numpy = op(left, right) python = left.combine(right, op) - tm.assert_series_equal(cython_or_numpy, python, - check_dtype=check_dtype) + assert_series_equal(cython_or_numpy, python, + check_dtype=check_dtype) def check(series, other): simple_ops = ['add', 'sub', 'mul', 'truediv', 'floordiv', 'mod'] @@ -184,7 +188,7 @@ def check_comparators(series, other, check_dtype=True): def test_operators_empty_int_corner(self): s1 = Series([], [], dtype=np.int32) s2 = Series({'x': 0.}) - tm.assert_series_equal(s1 * s2, Series([np.nan], index=['x'])) + assert_series_equal(s1 * s2, Series([np.nan], index=['x'])) def test_operators_timedelta64(self): @@ -264,6 +268,18 @@ def test_operators_timedelta64(self): rs[2] += np.timedelta64(timedelta(minutes=5, seconds=1)) self.assertEqual(rs[2], value) + def test_operator_series_comparison_zerorank(self): + # GH 13006 + result = np.float64(0) > pd.Series([1, 2, 3]) + expected = 0.0 > pd.Series([1, 2, 3]) + self.assert_series_equal(result, expected) + result = pd.Series([1, 2, 3]) < np.float64(0) + expected = pd.Series([1, 2, 3]) < 0.0 + self.assert_series_equal(result, expected) + result = np.array([0, 1, 2])[0] > pd.Series([0, 1, 2]) + expected = 0.0 > pd.Series([1, 2, 3]) + self.assert_series_equal(result, expected) + def test_timedeltas_with_DateOffset(self): # GH 4532 @@ -559,11 +575,11 @@ def run_ops(ops, get_ser, test_ser): td2 / td1 # ## datetime64 ### - dt1 = Series([Timestamp('20111230'), Timestamp('20120101'), Timestamp( - '20120103')]) + dt1 = Series([Timestamp('20111230'), Timestamp('20120101'), + Timestamp('20120103')]) dt1.iloc[2] = np.nan - dt2 = Series([Timestamp('20111231'), Timestamp('20120102'), Timestamp( - '20120104')]) + dt2 = Series([Timestamp('20111231'), Timestamp('20120102'), + Timestamp('20120104')]) ops = ['__add__', '__mul__', '__floordiv__', '__truediv__', '__div__', '__pow__', '__radd__', '__rmul__', '__rfloordiv__', '__rtruediv__', '__rdiv__', '__rpow__'] @@ -595,9 +611,10 @@ def run_ops(ops, get_ser, test_ser): ops = ['__mul__', '__floordiv__', '__truediv__', '__div__', '__pow__', '__rmul__', '__rfloordiv__', '__rtruediv__', '__rdiv__', '__rpow__'] - dt1 = Series( - date_range('2000-01-01 09:00:00', periods=5, - tz='US/Eastern'), name='foo') + + tz = 'US/Eastern' + dt1 = Series(date_range('2000-01-01 09:00:00', 
periods=5, + tz=tz), name='foo') dt2 = dt1.copy() dt2.iloc[2] = np.nan td1 = Series(timedelta_range('1 days 1 min', periods=5, freq='H')) @@ -606,62 +623,62 @@ def run_ops(ops, get_ser, test_ser): run_ops(ops, dt1, td1) result = dt1 + td1[0] - expected = ( - dt1.dt.tz_localize(None) + td1[0]).dt.tz_localize('US/Eastern') - assert_series_equal(result, expected) + exp = (dt1.dt.tz_localize(None) + td1[0]).dt.tz_localize(tz) + assert_series_equal(result, exp) result = dt2 + td2[0] - expected = ( - dt2.dt.tz_localize(None) + td2[0]).dt.tz_localize('US/Eastern') - assert_series_equal(result, expected) + exp = (dt2.dt.tz_localize(None) + td2[0]).dt.tz_localize(tz) + assert_series_equal(result, exp) # odd numpy behavior with scalar timedeltas if not _np_version_under1p8: result = td1[0] + dt1 - expected = ( - dt1.dt.tz_localize(None) + td1[0]).dt.tz_localize('US/Eastern') - assert_series_equal(result, expected) + exp = (dt1.dt.tz_localize(None) + td1[0]).dt.tz_localize(tz) + assert_series_equal(result, exp) result = td2[0] + dt2 - expected = ( - dt2.dt.tz_localize(None) + td2[0]).dt.tz_localize('US/Eastern') - assert_series_equal(result, expected) + exp = (dt2.dt.tz_localize(None) + td2[0]).dt.tz_localize(tz) + assert_series_equal(result, exp) result = dt1 - td1[0] - expected = ( - dt1.dt.tz_localize(None) - td1[0]).dt.tz_localize('US/Eastern') - assert_series_equal(result, expected) + exp = (dt1.dt.tz_localize(None) - td1[0]).dt.tz_localize(tz) + assert_series_equal(result, exp) self.assertRaises(TypeError, lambda: td1[0] - dt1) result = dt2 - td2[0] - expected = ( - dt2.dt.tz_localize(None) - td2[0]).dt.tz_localize('US/Eastern') - assert_series_equal(result, expected) + exp = (dt2.dt.tz_localize(None) - td2[0]).dt.tz_localize(tz) + assert_series_equal(result, exp) self.assertRaises(TypeError, lambda: td2[0] - dt2) result = dt1 + td1 - expected = ( - dt1.dt.tz_localize(None) + td1).dt.tz_localize('US/Eastern') - assert_series_equal(result, expected) + exp = (dt1.dt.tz_localize(None) + td1).dt.tz_localize(tz) + assert_series_equal(result, exp) result = dt2 + td2 - expected = ( - dt2.dt.tz_localize(None) + td2).dt.tz_localize('US/Eastern') - assert_series_equal(result, expected) + exp = (dt2.dt.tz_localize(None) + td2).dt.tz_localize(tz) + assert_series_equal(result, exp) result = dt1 - td1 - expected = ( - dt1.dt.tz_localize(None) - td1).dt.tz_localize('US/Eastern') - assert_series_equal(result, expected) + exp = (dt1.dt.tz_localize(None) - td1).dt.tz_localize(tz) + assert_series_equal(result, exp) result = dt2 - td2 - expected = ( - dt2.dt.tz_localize(None) - td2).dt.tz_localize('US/Eastern') - assert_series_equal(result, expected) + exp = (dt2.dt.tz_localize(None) - td2).dt.tz_localize(tz) + assert_series_equal(result, exp) self.assertRaises(TypeError, lambda: td1 - dt1) self.assertRaises(TypeError, lambda: td2 - dt2) + def test_sub_datetime_compat(self): + # GH 14088 + tm._skip_if_no_pytz() + import pytz + s = Series([datetime(2016, 8, 23, 12, tzinfo=pytz.utc), pd.NaT]) + dt = datetime(2016, 8, 22, 12, tzinfo=pytz.utc) + exp = Series([Timedelta('1 days'), pd.NaT]) + assert_series_equal(s - dt, exp) + assert_series_equal(s - Timestamp(dt), exp) + def test_sub_single_tz(self): # GH12290 s1 = Series([pd.Timestamp('2016-02-10', tz='America/Sao_Paulo')]) @@ -968,24 +985,97 @@ def test_comparison_invalid(self): self.assertRaises(TypeError, lambda: x <= y) def test_more_na_comparisons(self): - left = Series(['a', np.nan, 'c']) - right = Series(['a', np.nan, 'd']) + for dtype in [None, object]: + left 
= Series(['a', np.nan, 'c'], dtype=dtype) + right = Series(['a', np.nan, 'd'], dtype=dtype) - result = left == right - expected = Series([True, False, False]) - assert_series_equal(result, expected) + result = left == right + expected = Series([True, False, False]) + assert_series_equal(result, expected) - result = left != right - expected = Series([False, True, True]) - assert_series_equal(result, expected) + result = left != right + expected = Series([False, True, True]) + assert_series_equal(result, expected) - result = left == np.nan - expected = Series([False, False, False]) - assert_series_equal(result, expected) + result = left == np.nan + expected = Series([False, False, False]) + assert_series_equal(result, expected) - result = left != np.nan - expected = Series([True, True, True]) - assert_series_equal(result, expected) + result = left != np.nan + expected = Series([True, True, True]) + assert_series_equal(result, expected) + + def test_nat_comparisons(self): + data = [([pd.Timestamp('2011-01-01'), pd.NaT, + pd.Timestamp('2011-01-03')], + [pd.NaT, pd.NaT, pd.Timestamp('2011-01-03')]), + + ([pd.Timedelta('1 days'), pd.NaT, + pd.Timedelta('3 days')], + [pd.NaT, pd.NaT, pd.Timedelta('3 days')]), + + ([pd.Period('2011-01', freq='M'), pd.NaT, + pd.Period('2011-03', freq='M')], + [pd.NaT, pd.NaT, pd.Period('2011-03', freq='M')])] + + # add lhs / rhs switched data + data = data + [(r, l) for l, r in data] + + for l, r in data: + for dtype in [None, object]: + left = Series(l, dtype=dtype) + + # Series, Index + for right in [Series(r, dtype=dtype), Index(r, dtype=dtype)]: + expected = Series([False, False, True]) + assert_series_equal(left == right, expected) + + expected = Series([True, True, False]) + assert_series_equal(left != right, expected) + + expected = Series([False, False, False]) + assert_series_equal(left < right, expected) + + expected = Series([False, False, False]) + assert_series_equal(left > right, expected) + + expected = Series([False, False, True]) + assert_series_equal(left >= right, expected) + + expected = Series([False, False, True]) + assert_series_equal(left <= right, expected) + + def test_nat_comparisons_scalar(self): + data = [[pd.Timestamp('2011-01-01'), pd.NaT, + pd.Timestamp('2011-01-03')], + + [pd.Timedelta('1 days'), pd.NaT, pd.Timedelta('3 days')], + + [pd.Period('2011-01', freq='M'), pd.NaT, + pd.Period('2011-03', freq='M')]] + + for l in data: + for dtype in [None, object]: + left = Series(l, dtype=dtype) + + expected = Series([False, False, False]) + assert_series_equal(left == pd.NaT, expected) + assert_series_equal(pd.NaT == left, expected) + + expected = Series([True, True, True]) + assert_series_equal(left != pd.NaT, expected) + assert_series_equal(pd.NaT != left, expected) + + expected = Series([False, False, False]) + assert_series_equal(left < pd.NaT, expected) + assert_series_equal(pd.NaT > left, expected) + assert_series_equal(left <= pd.NaT, expected) + assert_series_equal(pd.NaT >= left, expected) + + assert_series_equal(left > pd.NaT, expected) + assert_series_equal(pd.NaT < left, expected) + assert_series_equal(left >= pd.NaT, expected) + assert_series_equal(pd.NaT <= left, expected) def test_comparison_different_length(self): a = Series(['a', 'b', 'c']) @@ -1004,15 +1094,15 @@ def test_comparison_label_based(self): a = Series([True, False, True], list('bca')) b = Series([False, True, False], list('abc')) - expected = Series([True, False, False], list('bca')) + expected = Series([False, True, False], list('abc')) result = a & b 
assert_series_equal(result, expected) - expected = Series([True, False, True], list('bca')) + expected = Series([True, True, False], list('abc')) result = a | b assert_series_equal(result, expected) - expected = Series([False, False, True], list('bca')) + expected = Series([True, False, False], list('abc')) result = a ^ b assert_series_equal(result, expected) @@ -1020,11 +1110,11 @@ def test_comparison_label_based(self): a = Series([True, False, True], list('bca')) b = Series([False, True, False, True], list('abcd')) - expected = Series([True, False, False], list('bca')) + expected = Series([False, True, False, False], list('abcd')) result = a & b assert_series_equal(result, expected) - expected = Series([True, False, True], list('bca')) + expected = Series([True, True, False, False], list('abcd')) result = a | b assert_series_equal(result, expected) @@ -1041,20 +1131,28 @@ def test_comparison_label_based(self): # vs non-matching result = a & Series([1], ['z']) - expected = Series([False, False, False], list('bca')) + expected = Series([False, False, False, False], list('abcz')) assert_series_equal(result, expected) result = a | Series([1], ['z']) - expected = Series([True, False, True], list('bca')) + expected = Series([True, True, False, False], list('abcz')) assert_series_equal(result, expected) # identity # we would like s[s|e] == s to hold for any e, whether empty or not - for e in [Series([]), Series([1], ['z']), Series(['z']), + for e in [Series([]), Series([1], ['z']), Series(np.nan, b.index), Series(np.nan, a.index)]: result = a[a | e] assert_series_equal(result, a[a]) + for e in [Series(['z'])]: + if compat.PY3: + with tm.assert_produces_warning(RuntimeWarning): + result = a[a | e] + else: + result = a[a | e] + assert_series_equal(result, a[a]) + # vs scalars index = list('bca') t = Series([True, False, True]) @@ -1084,6 +1182,76 @@ def test_comparison_label_based(self): for v in [np.nan]: self.assertRaises(TypeError, lambda: t & v) + def test_comparison_flex_basic(self): + left = pd.Series(np.random.randn(10)) + right = pd.Series(np.random.randn(10)) + + assert_series_equal(left.eq(right), left == right) + assert_series_equal(left.ne(right), left != right) + assert_series_equal(left.le(right), left <= right) + assert_series_equal(left.lt(right), left < right) + assert_series_equal(left.gt(right), left > right) + assert_series_equal(left.ge(right), left >= right) + + # axis + for axis in [0, None, 'index']: + assert_series_equal(left.eq(right, axis=axis), left == right) + assert_series_equal(left.ne(right, axis=axis), left != right) + assert_series_equal(left.le(right, axis=axis), left <= right) + assert_series_equal(left.lt(right, axis=axis), left < right) + assert_series_equal(left.gt(right, axis=axis), left > right) + assert_series_equal(left.ge(right, axis=axis), left >= right) + + # invalid axis + msg = 'No axis named 1 for object type' + for op in ['eq', 'ne', 'le', 'lt', 'gt', 'ge']: + with tm.assertRaisesRegexp(ValueError, msg): + getattr(left, op)(right, axis=1) + + def test_comparison_flex_alignment(self): + left = Series([1, 3, 2], index=list('abc')) + right = Series([2, 2, 2], index=list('bcd')) + + exp = pd.Series([False, False, True, False], index=list('abcd')) + assert_series_equal(left.eq(right), exp) + + exp = pd.Series([True, True, False, True], index=list('abcd')) + assert_series_equal(left.ne(right), exp) + + exp = pd.Series([False, False, True, False], index=list('abcd')) + assert_series_equal(left.le(right), exp) + + exp = pd.Series([False, False, False, False],
index=list('abcd')) + assert_series_equal(left.lt(right), exp) + + exp = pd.Series([False, True, True, False], index=list('abcd')) + assert_series_equal(left.ge(right), exp) + + exp = pd.Series([False, True, False, False], index=list('abcd')) + assert_series_equal(left.gt(right), exp) + + def test_comparison_flex_alignment_fill(self): + left = Series([1, 3, 2], index=list('abc')) + right = Series([2, 2, 2], index=list('bcd')) + + exp = pd.Series([False, False, True, True], index=list('abcd')) + assert_series_equal(left.eq(right, fill_value=2), exp) + + exp = pd.Series([True, True, False, False], index=list('abcd')) + assert_series_equal(left.ne(right, fill_value=2), exp) + + exp = pd.Series([False, False, True, True], index=list('abcd')) + assert_series_equal(left.le(right, fill_value=0), exp) + + exp = pd.Series([False, False, False, True], index=list('abcd')) + assert_series_equal(left.lt(right, fill_value=0), exp) + + exp = pd.Series([True, True, True, False], index=list('abcd')) + assert_series_equal(left.ge(right, fill_value=0), exp) + + exp = pd.Series([True, True, False, False], index=list('abcd')) + assert_series_equal(left.gt(right, fill_value=0), exp) + def test_operators_bitwise(self): # GH 9016: support bitwise op for integer types index = list('bca') @@ -1119,11 +1287,11 @@ def test_operators_bitwise(self): s_a0b1c0 = Series([1], list('b')) res = s_tft & s_a0b1c0 - expected = s_tff + expected = s_tff.reindex(list('abc')) assert_series_equal(res, expected) res = s_tft | s_a0b1c0 - expected = s_tft + expected = s_tft.reindex(list('abc')) assert_series_equal(res, expected) n0 = 0 @@ -1160,9 +1328,25 @@ def test_operators_bitwise(self): self.assertRaises(TypeError, lambda: s_0123 & [0.1, 4, 3.14, 2]) # s_0123 will be all false now because of reindexing like s_tft - assert_series_equal(s_tft & s_0123, Series([False] * 3, list('bca'))) + if compat.PY3: + # unable to sort incompatible object via .union. + exp = Series([False] * 7, index=['b', 'c', 'a', 0, 1, 2, 3]) + with tm.assert_produces_warning(RuntimeWarning): + assert_series_equal(s_tft & s_0123, exp) + else: + exp = Series([False] * 7, index=[0, 1, 2, 3, 'a', 'b', 'c']) + assert_series_equal(s_tft & s_0123, exp) + # s_tft will be all false now because of reindexing like s_0123 - assert_series_equal(s_0123 & s_tft, Series([False] * 4)) + if compat.PY3: + # unable to sort incompatible object via .union. 
+ exp = Series([False] * 7, index=[0, 1, 2, 3, 'b', 'c', 'a']) + with tm.assert_produces_warning(RuntimeWarning): + assert_series_equal(s_0123 & s_tft, exp) + else: + exp = Series([False] * 7, index=[0, 1, 2, 3, 'a', 'b', 'c']) + assert_series_equal(s_0123 & s_tft, exp) + assert_series_equal(s_0123 & False, Series([False] * 4)) assert_series_equal(s_0123 ^ False, Series([False, True, True, True])) assert_series_equal(s_0123 & [False], Series([False] * 4)) @@ -1246,9 +1430,124 @@ def _check_op(arr, op): _check_op(arr, operator.truediv) _check_op(arr, operator.floordiv) - def test_series_frame_radd_bug(self): - import operator + def test_arith_ops_df_compat(self): + # GH 1134 + s1 = pd.Series([1, 2, 3], index=list('ABC'), name='x') + s2 = pd.Series([2, 2, 2], index=list('ABD'), name='x') + + exp = pd.Series([3.0, 4.0, np.nan, np.nan], + index=list('ABCD'), name='x') + assert_series_equal(s1 + s2, exp) + assert_series_equal(s2 + s1, exp) + + exp = pd.DataFrame({'x': [3.0, 4.0, np.nan, np.nan]}, + index=list('ABCD')) + assert_frame_equal(s1.to_frame() + s2.to_frame(), exp) + assert_frame_equal(s2.to_frame() + s1.to_frame(), exp) + + # different length + s3 = pd.Series([1, 2, 3], index=list('ABC'), name='x') + s4 = pd.Series([2, 2, 2, 2], index=list('ABCD'), name='x') + + exp = pd.Series([3, 4, 5, np.nan], + index=list('ABCD'), name='x') + assert_series_equal(s3 + s4, exp) + assert_series_equal(s4 + s3, exp) + + exp = pd.DataFrame({'x': [3, 4, 5, np.nan]}, + index=list('ABCD')) + assert_frame_equal(s3.to_frame() + s4.to_frame(), exp) + assert_frame_equal(s4.to_frame() + s3.to_frame(), exp) + + def test_comp_ops_df_compat(self): + # GH 1134 + s1 = pd.Series([1, 2, 3], index=list('ABC'), name='x') + s2 = pd.Series([2, 2, 2], index=list('ABD'), name='x') + + s3 = pd.Series([1, 2, 3], index=list('ABC'), name='x') + s4 = pd.Series([2, 2, 2, 2], index=list('ABCD'), name='x') + + for l, r in [(s1, s2), (s2, s1), (s3, s4), (s4, s3)]: + + msg = "Can only compare identically-labeled Series objects" + with tm.assertRaisesRegexp(ValueError, msg): + l == r + + with tm.assertRaisesRegexp(ValueError, msg): + l != r + + with tm.assertRaisesRegexp(ValueError, msg): + l < r + + msg = "Can only compare identically-labeled DataFrame objects" + with tm.assertRaisesRegexp(ValueError, msg): + l.to_frame() == r.to_frame() + + with tm.assertRaisesRegexp(ValueError, msg): + l.to_frame() != r.to_frame() + + with tm.assertRaisesRegexp(ValueError, msg): + l.to_frame() < r.to_frame() + + def test_bool_ops_df_compat(self): + # GH 1134 + s1 = pd.Series([True, False, True], index=list('ABC'), name='x') + s2 = pd.Series([True, True, False], index=list('ABD'), name='x') + + exp = pd.Series([True, False, False, False], + index=list('ABCD'), name='x') + assert_series_equal(s1 & s2, exp) + assert_series_equal(s2 & s1, exp) + + # True | np.nan => True + exp = pd.Series([True, True, True, False], + index=list('ABCD'), name='x') + assert_series_equal(s1 | s2, exp) + # np.nan | True => np.nan, filled with False + exp = pd.Series([True, True, False, False], + index=list('ABCD'), name='x') + assert_series_equal(s2 | s1, exp) + + # DataFrame doesn't fill nan with False + exp = pd.DataFrame({'x': [True, False, np.nan, np.nan]}, + index=list('ABCD')) + assert_frame_equal(s1.to_frame() & s2.to_frame(), exp) + assert_frame_equal(s2.to_frame() & s1.to_frame(), exp) + + exp = pd.DataFrame({'x': [True, True, np.nan, np.nan]}, + index=list('ABCD')) + assert_frame_equal(s1.to_frame() | s2.to_frame(), exp) + assert_frame_equal(s2.to_frame() | 
s1.to_frame(), exp) + + # different length + s3 = pd.Series([True, False, True], index=list('ABC'), name='x') + s4 = pd.Series([True, True, True, True], index=list('ABCD'), name='x') + + exp = pd.Series([True, False, True, False], + index=list('ABCD'), name='x') + assert_series_equal(s3 & s4, exp) + assert_series_equal(s4 & s3, exp) + + # np.nan | True => np.nan, filled with False + exp = pd.Series([True, True, True, False], + index=list('ABCD'), name='x') + assert_series_equal(s3 | s4, exp) + # True | np.nan => True + exp = pd.Series([True, True, True, True], + index=list('ABCD'), name='x') + assert_series_equal(s4 | s3, exp) + + exp = pd.DataFrame({'x': [True, False, True, np.nan]}, + index=list('ABCD')) + assert_frame_equal(s3.to_frame() & s4.to_frame(), exp) + assert_frame_equal(s4.to_frame() & s3.to_frame(), exp) + + exp = pd.DataFrame({'x': [True, True, True, np.nan]}, + index=list('ABCD')) + assert_frame_equal(s3.to_frame() | s4.to_frame(), exp) + assert_frame_equal(s4.to_frame() | s3.to_frame(), exp) + def test_series_frame_radd_bug(self): # GH 353 vals = Series(tm.rands_array(5, 10)) result = 'foo_' + vals @@ -1258,23 +1557,94 @@ def test_series_frame_radd_bug(self): frame = DataFrame({'vals': vals}) result = 'foo_' + frame expected = DataFrame({'vals': vals.map(lambda x: 'foo_' + x)}) - tm.assert_frame_equal(result, expected) + assert_frame_equal(result, expected) # really raise this time - self.assertRaises(TypeError, operator.add, datetime.now(), self.ts) + with tm.assertRaises(TypeError): + datetime.now() + self.ts + + with tm.assertRaises(TypeError): + self.ts + datetime.now() + + def test_series_radd_more(self): + data = [[1, 2, 3], + [1.1, 2.2, 3.3], + [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02'), + pd.NaT], + ['x', 'y', 1]] + + for d in data: + for dtype in [None, object]: + s = Series(d, dtype=dtype) + with tm.assertRaises(TypeError): + 'foo_' + s + + for dtype in [None, object]: + res = 1 + pd.Series([1, 2, 3], dtype=dtype) + exp = pd.Series([2, 3, 4], dtype=dtype) + assert_series_equal(res, exp) + res = pd.Series([1, 2, 3], dtype=dtype) + 1 + assert_series_equal(res, exp) + + res = np.nan + pd.Series([1, 2, 3], dtype=dtype) + exp = pd.Series([np.nan, np.nan, np.nan], dtype=dtype) + assert_series_equal(res, exp) + res = pd.Series([1, 2, 3], dtype=dtype) + np.nan + assert_series_equal(res, exp) + + s = pd.Series([pd.Timedelta('1 days'), pd.Timedelta('2 days'), + pd.Timedelta('3 days')], dtype=dtype) + exp = pd.Series([pd.Timedelta('4 days'), pd.Timedelta('5 days'), + pd.Timedelta('6 days')]) + assert_series_equal(pd.Timedelta('3 days') + s, exp) + assert_series_equal(s + pd.Timedelta('3 days'), exp) + + s = pd.Series(['x', np.nan, 'x']) + assert_series_equal('a' + s, pd.Series(['ax', np.nan, 'ax'])) + assert_series_equal(s + 'a', pd.Series(['xa', np.nan, 'xa'])) + + def test_frame_radd_more(self): + data = [[1, 2, 3], + [1.1, 2.2, 3.3], + [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02'), + pd.NaT], + ['x', 'y', 1]] + + for d in data: + for dtype in [None, object]: + s = DataFrame(d, dtype=dtype) + with tm.assertRaises(TypeError): + 'foo_' + s + + for dtype in [None, object]: + res = 1 + pd.DataFrame([1, 2, 3], dtype=dtype) + exp = pd.DataFrame([2, 3, 4], dtype=dtype) + assert_frame_equal(res, exp) + res = pd.DataFrame([1, 2, 3], dtype=dtype) + 1 + assert_frame_equal(res, exp) + + res = np.nan + pd.DataFrame([1, 2, 3], dtype=dtype) + exp = pd.DataFrame([np.nan, np.nan, np.nan], dtype=dtype) + assert_frame_equal(res, exp) + res = pd.DataFrame([1, 2, 3], 
dtype=dtype) + np.nan + assert_frame_equal(res, exp) + + df = pd.DataFrame(['x', np.nan, 'x']) + assert_frame_equal('a' + df, pd.DataFrame(['ax', np.nan, 'ax'])) + assert_frame_equal(df + 'a', pd.DataFrame(['xa', np.nan, 'xa'])) def test_operators_frame(self): # rpow does not work with DataFrame df = DataFrame({'A': self.ts}) - tm.assert_series_equal(self.ts + self.ts, self.ts + df['A'], - check_names=False) - tm.assert_series_equal(self.ts ** self.ts, self.ts ** df['A'], - check_names=False) - tm.assert_series_equal(self.ts < self.ts, self.ts < df['A'], - check_names=False) - tm.assert_series_equal(self.ts / self.ts, self.ts / df['A'], - check_names=False) + assert_series_equal(self.ts + self.ts, self.ts + df['A'], + check_names=False) + assert_series_equal(self.ts ** self.ts, self.ts ** df['A'], + check_names=False) + assert_series_equal(self.ts < self.ts, self.ts < df['A'], + check_names=False) + assert_series_equal(self.ts / self.ts, self.ts / df['A'], + check_names=False) def test_operators_combine(self): def _check_fill(meth, op, a, b, fill_value=0): @@ -1287,18 +1657,19 @@ def _check_fill(meth, op, a, b, fill_value=0): exp_values = [] for i in range(len(exp_index)): - if amask[i]: - if bmask[i]: - exp_values.append(nan) - continue - exp_values.append(op(fill_value, b[i])) - elif bmask[i]: + with np.errstate(all='ignore'): if amask[i]: - exp_values.append(nan) - continue - exp_values.append(op(a[i], fill_value)) - else: - exp_values.append(op(a[i], b[i])) + if bmask[i]: + exp_values.append(nan) + continue + exp_values.append(op(fill_value, b[i])) + elif bmask[i]: + if amask[i]: + exp_values.append(nan) + continue + exp_values.append(op(a[i], fill_value)) + else: + exp_values.append(op(a[i], b[i])) result = meth(a, b, fill_value=fill_value) expected = Series(exp_values, exp_index) @@ -1369,12 +1740,12 @@ def test_divide_decimal(self): s = Series([Decimal(10)]) s = s / Decimal(2) - tm.assert_series_equal(expected, s) + assert_series_equal(expected, s) s = Series([Decimal(10)]) s = s // Decimal(2) - tm.assert_series_equal(expected, s) + assert_series_equal(expected, s) def test_datetime64_with_index(self): @@ -1401,3 +1772,12 @@ def test_datetime64_with_index(self): df['expected'] = df['date'] - df.index.to_series() df['result'] = df['date'] - df.index assert_series_equal(df['result'], df['expected'], check_names=False) + + def test_dti_tz_convert_to_utc(self): + base = pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'], + tz='UTC') + idx1 = base.tz_convert('Asia/Tokyo')[:2] + idx2 = base.tz_convert('US/Eastern')[1:] + + res = Series([1, 2], index=idx1) + Series([1, 1], index=idx2) + assert_series_equal(res, Series([np.nan, 3, np.nan], index=base)) diff --git a/pandas/tests/series/test_quantile.py b/pandas/tests/series/test_quantile.py index e0bff7fbd39e4..7d2517987e526 100644 --- a/pandas/tests/series/test_quantile.py +++ b/pandas/tests/series/test_quantile.py @@ -7,7 +7,7 @@ from pandas import (Index, Series, _np_version_under1p9) from pandas.tseries.index import Timestamp -import pandas.core.common as com +from pandas.types.common import is_integer import pandas.util.testing as tm from .common import TestData @@ -96,11 +96,11 @@ def test_quantile_interpolation_dtype(self): # interpolation = linear (default case) q = pd.Series([1, 3, 4]).quantile(0.5, interpolation='lower') self.assertEqual(q, percentile(np.array([1, 3, 4]), 50)) - self.assertTrue(com.is_integer(q)) + self.assertTrue(is_integer(q)) q = pd.Series([1, 3, 4]).quantile(0.5, interpolation='higher') 
self.assertEqual(q, percentile(np.array([1, 3, 4]), 50)) - self.assertTrue(com.is_integer(q)) + self.assertTrue(is_integer(q)) def test_quantile_interpolation_np_lt_1p9(self): # GH #10174 diff --git a/pandas/tests/series/test_sorting.py b/pandas/tests/series/test_sorting.py new file mode 100644 index 0000000000000..826201adbdb50 --- /dev/null +++ b/pandas/tests/series/test_sorting.py @@ -0,0 +1,146 @@ +# coding=utf-8 + +import numpy as np +import random + +from pandas import (DataFrame, Series, MultiIndex) + +from pandas.util.testing import (assert_series_equal, assert_almost_equal) +import pandas.util.testing as tm + +from .common import TestData + + +class TestSeriesSorting(TestData, tm.TestCase): + + _multiprocess_can_split_ = True + + def test_sort(self): + + ts = self.ts.copy() + + # 9816 deprecated + with tm.assert_produces_warning(FutureWarning): + ts.sort() # sorts inplace + self.assert_series_equal(ts, self.ts.sort_values()) + + def test_order(self): + + # 9816 deprecated + with tm.assert_produces_warning(FutureWarning): + result = self.ts.order() + self.assert_series_equal(result, self.ts.sort_values()) + + def test_sort_values(self): + + # check indexes are reordered corresponding with the values + ser = Series([3, 2, 4, 1], ['A', 'B', 'C', 'D']) + expected = Series([1, 2, 3, 4], ['D', 'B', 'A', 'C']) + result = ser.sort_values() + self.assert_series_equal(expected, result) + + ts = self.ts.copy() + ts[:5] = np.NaN + vals = ts.values + + result = ts.sort_values() + self.assertTrue(np.isnan(result[-5:]).all()) + self.assert_numpy_array_equal(result[:-5].values, np.sort(vals[5:])) + + # na_position + result = ts.sort_values(na_position='first') + self.assertTrue(np.isnan(result[:5]).all()) + self.assert_numpy_array_equal(result[5:].values, np.sort(vals[5:])) + + # something object-type + ser = Series(['A', 'B'], [1, 2]) + # no failure + ser.sort_values() + + # ascending=False + ordered = ts.sort_values(ascending=False) + expected = np.sort(ts.valid().values)[::-1] + assert_almost_equal(expected, ordered.valid().values) + ordered = ts.sort_values(ascending=False, na_position='first') + assert_almost_equal(expected, ordered.valid().values) + + # inplace=True + ts = self.ts.copy() + ts.sort_values(ascending=False, inplace=True) + self.assert_series_equal(ts, self.ts.sort_values(ascending=False)) + self.assert_index_equal(ts.index, + self.ts.sort_values(ascending=False).index) + + # GH 5856/5853 + # Series.sort_values operating on a view + df = DataFrame(np.random.randn(10, 4)) + s = df.iloc[:, 0] + + def f(): + s.sort_values(inplace=True) + + self.assertRaises(ValueError, f) + + def test_sort_index(self): + rindex = list(self.ts.index) + random.shuffle(rindex) + + random_order = self.ts.reindex(rindex) + sorted_series = random_order.sort_index() + assert_series_equal(sorted_series, self.ts) + + # descending + sorted_series = random_order.sort_index(ascending=False) + assert_series_equal(sorted_series, + self.ts.reindex(self.ts.index[::-1])) + + # compat on level + sorted_series = random_order.sort_index(level=0) + assert_series_equal(sorted_series, self.ts) + + # compat on axis + sorted_series = random_order.sort_index(axis=0) + assert_series_equal(sorted_series, self.ts) + + self.assertRaises(ValueError, lambda: random_order.sort_values(axis=1)) + + sorted_series = random_order.sort_index(level=0, axis=0) + assert_series_equal(sorted_series, self.ts) + + self.assertRaises(ValueError, + lambda: random_order.sort_index(level=0, axis=1)) + + def test_sort_index_inplace(self): + + # For 
#11402 + rindex = list(self.ts.index) + random.shuffle(rindex) + + # descending + random_order = self.ts.reindex(rindex) + result = random_order.sort_index(ascending=False, inplace=True) + self.assertIs(result, None, + msg='sort_index() inplace should return None') + assert_series_equal(random_order, self.ts.reindex(self.ts.index[::-1])) + + # ascending + random_order = self.ts.reindex(rindex) + result = random_order.sort_index(ascending=True, inplace=True) + self.assertIs(result, None, + msg='sort_index() inplace should return None') + assert_series_equal(random_order, self.ts) + + def test_sort_index_multiindex(self): + + mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC')) + s = Series([1, 2], mi) + backwards = s.iloc[[1, 0]] + + # implicit sort_remaining=True + res = s.sort_index(level='A') + assert_series_equal(backwards, res) + + # GH13496 + # rows share same level='A': sort has no effect without remaining lvls + res = s.sort_index(level='A', sort_remaining=False) + assert_series_equal(s, res) diff --git a/pandas/tests/series/test_subclass.py b/pandas/tests/series/test_subclass.py index 016113961ec74..cc07c7d9dd59b 100644 --- a/pandas/tests/series/test_subclass.py +++ b/pandas/tests/series/test_subclass.py @@ -1,6 +1,8 @@ # coding=utf-8 # pylint: disable-msg=E1101,W0612 +import numpy as np +import pandas as pd import pandas.util.testing as tm @@ -31,3 +33,72 @@ def test_to_frame(self): exp = tm.SubclassedDataFrame({'xxx': [1, 2, 3, 4]}, index=list('abcd')) tm.assert_frame_equal(res, exp) tm.assertIsInstance(res, tm.SubclassedDataFrame) + + +class TestSparseSeriesSubclassing(tm.TestCase): + + _multiprocess_can_split_ = True + + def test_subclass_sparse_slice(self): + # int64 + s = tm.SubclassedSparseSeries([1, 2, 3, 4, 5]) + exp = tm.SubclassedSparseSeries([2, 3, 4], index=[1, 2, 3]) + tm.assert_sp_series_equal(s.loc[1:3], exp) + self.assertEqual(s.loc[1:3].dtype, np.int64) + + exp = tm.SubclassedSparseSeries([2, 3], index=[1, 2]) + tm.assert_sp_series_equal(s.iloc[1:3], exp) + self.assertEqual(s.iloc[1:3].dtype, np.int64) + + exp = tm.SubclassedSparseSeries([2, 3], index=[1, 2]) + tm.assert_sp_series_equal(s[1:3], exp) + self.assertEqual(s[1:3].dtype, np.int64) + + # float64 + s = tm.SubclassedSparseSeries([1., 2., 3., 4., 5.]) + exp = tm.SubclassedSparseSeries([2., 3., 4.], index=[1, 2, 3]) + tm.assert_sp_series_equal(s.loc[1:3], exp) + self.assertEqual(s.loc[1:3].dtype, np.float64) + + exp = tm.SubclassedSparseSeries([2., 3.], index=[1, 2]) + tm.assert_sp_series_equal(s.iloc[1:3], exp) + self.assertEqual(s.iloc[1:3].dtype, np.float64) + + exp = tm.SubclassedSparseSeries([2., 3.], index=[1, 2]) + tm.assert_sp_series_equal(s[1:3], exp) + self.assertEqual(s[1:3].dtype, np.float64) + + def test_subclass_sparse_addition(self): + s1 = tm.SubclassedSparseSeries([1, 3, 5]) + s2 = tm.SubclassedSparseSeries([-2, 5, 12]) + exp = tm.SubclassedSparseSeries([-1, 8, 17]) + tm.assert_sp_series_equal(s1 + s2, exp) + + s1 = tm.SubclassedSparseSeries([4.0, 5.0, 6.0]) + s2 = tm.SubclassedSparseSeries([1.0, 2.0, 3.0]) + exp = tm.SubclassedSparseSeries([5., 7., 9.]) + tm.assert_sp_series_equal(s1 + s2, exp) + + def test_subclass_sparse_to_frame(self): + s = tm.SubclassedSparseSeries([1, 2], index=list('ab'), name='xxx') + res = s.to_frame() + + exp_arr = pd.SparseArray([1, 2], dtype=np.int64, kind='block', + fill_value=0) + exp = tm.SubclassedSparseDataFrame({'xxx': exp_arr}, + index=list('ab'), + default_fill_value=0) + tm.assert_sp_frame_equal(res, exp) + + # create from int dict
+ res = tm.SubclassedSparseDataFrame({'xxx': [1, 2]}, + index=list('ab'), + default_fill_value=0) + tm.assert_sp_frame_equal(res, exp) + + s = tm.SubclassedSparseSeries([1.1, 2.1], index=list('ab'), + name='xxx') + res = s.to_frame() + exp = tm.SubclassedSparseDataFrame({'xxx': [1.1, 2.1]}, + index=list('ab')) + tm.assert_sp_frame_equal(res, exp) diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index 13b95ea97eedf..6e3d52366a4ec 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -3,19 +3,17 @@ from datetime import datetime -from numpy import nan import numpy as np -from pandas import Index, Series, notnull, date_range +from pandas import Index, Series, date_range, NaT from pandas.tseries.index import DatetimeIndex +from pandas.tseries.offsets import BDay, BMonthEnd from pandas.tseries.tdi import TimedeltaIndex -import pandas.core.datetools as datetools - from pandas.util.testing import assert_series_equal, assert_almost_equal import pandas.util.testing as tm -from .common import TestData +from pandas.tests.series.common import TestData class TestSeriesTimeSeries(TestData, tm.TestCase): @@ -30,7 +28,7 @@ def test_shift(self): tm.assert_numpy_array_equal(unshifted.valid().values, self.ts.values[:-1]) - offset = datetools.bday + offset = BDay() shifted = self.ts.shift(1, freq=offset) unshifted = shifted.shift(-1, freq=offset) @@ -57,7 +55,7 @@ def test_shift(self): tm.assert_numpy_array_equal(unshifted.valid().values, ps.values[:-1]) shifted2 = ps.shift(1, 'B') - shifted3 = ps.shift(1, datetools.bday) + shifted3 = ps.shift(1, BDay()) assert_series_equal(shifted2, shifted3) assert_series_equal(ps, shifted2.shift(-1, 'B')) @@ -67,7 +65,7 @@ def test_shift(self): shifted4 = ps.shift(1, freq='B') assert_series_equal(shifted2, shifted4) - shifted5 = ps.shift(1, freq=datetools.bday) + shifted5 = ps.shift(1, freq=BDay()) assert_series_equal(shifted5, shifted4) # 32-bit taking @@ -94,6 +92,33 @@ def test_shift(self): tz='CET'), name='foo') self.assertRaises(ValueError, lambda: s - s2) + def test_shift_dst(self): + # GH 13926 + dates = date_range('2016-11-06', freq='H', periods=10, tz='US/Eastern') + s = Series(dates) + + res = s.shift(0) + tm.assert_series_equal(res, s) + self.assertEqual(res.dtype, 'datetime64[ns, US/Eastern]') + + res = s.shift(1) + exp_vals = [NaT] + dates.asobject.values.tolist()[:9] + exp = Series(exp_vals) + tm.assert_series_equal(res, exp) + self.assertEqual(res.dtype, 'datetime64[ns, US/Eastern]') + + res = s.shift(-2) + exp_vals = dates.asobject.values.tolist()[2:] + [NaT, NaT] + exp = Series(exp_vals) + tm.assert_series_equal(res, exp) + self.assertEqual(res.dtype, 'datetime64[ns, US/Eastern]') + + for ex in [10, -10, 20, -20]: + res = s.shift(ex) + exp = Series([NaT] * 10, dtype='datetime64[ns, US/Eastern]') + tm.assert_series_equal(res, exp) + self.assertEqual(res.dtype, 'datetime64[ns, US/Eastern]') + def test_tshift(self): # PeriodIndex ps = tm.makePeriodSeries() @@ -105,7 +130,7 @@ def test_tshift(self): shifted2 = ps.tshift(freq='B') assert_series_equal(shifted, shifted2) - shifted3 = ps.tshift(freq=datetools.bday) + shifted3 = ps.tshift(freq=BDay()) assert_series_equal(shifted, shifted3) self.assertRaises(ValueError, ps.tshift, freq='M') @@ -130,7 +155,7 @@ def test_tshift(self): self.assertRaises(ValueError, no_freq.tshift) def test_truncate(self): - offset = datetools.bday + offset = BDay() ts = self.ts[::3] @@ -179,51 +204,6 @@ def test_truncate(self):
before=self.ts.index[-1] + offset, after=self.ts.index[0] - offset) - def test_asof(self): - # array or list or dates - N = 50 - rng = date_range('1/1/1990', periods=N, freq='53s') - ts = Series(np.random.randn(N), index=rng) - ts[15:30] = np.nan - dates = date_range('1/1/1990', periods=N * 3, freq='25s') - - result = ts.asof(dates) - self.assertTrue(notnull(result).all()) - lb = ts.index[14] - ub = ts.index[30] - - result = ts.asof(list(dates)) - self.assertTrue(notnull(result).all()) - lb = ts.index[14] - ub = ts.index[30] - - mask = (result.index >= lb) & (result.index < ub) - rs = result[mask] - self.assertTrue((rs == ts[lb]).all()) - - val = result[result.index[result.index >= ub][0]] - self.assertEqual(ts[ub], val) - - self.ts[5:10] = np.NaN - self.ts[15:20] = np.NaN - - val1 = self.ts.asof(self.ts.index[7]) - val2 = self.ts.asof(self.ts.index[19]) - - self.assertEqual(val1, self.ts[4]) - self.assertEqual(val2, self.ts[14]) - - # accepts strings - val1 = self.ts.asof(str(self.ts.index[7])) - self.assertEqual(val1, self.ts[4]) - - # in there - self.assertEqual(self.ts.asof(self.ts.index[3]), self.ts[3]) - - # no as of value - d = self.ts.index[0] - datetools.bday - self.assertTrue(np.isnan(self.ts.asof(d))) - def test_getitem_setitem_datetimeindex(self): from pandas import date_range @@ -424,68 +404,6 @@ def test_getitem_setitem_periodindex(self): result[4:8] = ts[4:8] assert_series_equal(result, ts) - def test_asof_periodindex(self): - from pandas import period_range, PeriodIndex - # array or list or dates - N = 50 - rng = period_range('1/1/1990', periods=N, freq='H') - ts = Series(np.random.randn(N), index=rng) - ts[15:30] = np.nan - dates = date_range('1/1/1990', periods=N * 3, freq='37min') - - result = ts.asof(dates) - self.assertTrue(notnull(result).all()) - lb = ts.index[14] - ub = ts.index[30] - - result = ts.asof(list(dates)) - self.assertTrue(notnull(result).all()) - lb = ts.index[14] - ub = ts.index[30] - - pix = PeriodIndex(result.index.values, freq='H') - mask = (pix >= lb) & (pix < ub) - rs = result[mask] - self.assertTrue((rs == ts[lb]).all()) - - ts[5:10] = np.NaN - ts[15:20] = np.NaN - - val1 = ts.asof(ts.index[7]) - val2 = ts.asof(ts.index[19]) - - self.assertEqual(val1, ts[4]) - self.assertEqual(val2, ts[14]) - - # accepts strings - val1 = ts.asof(str(ts.index[7])) - self.assertEqual(val1, ts[4]) - - # in there - self.assertEqual(ts.asof(ts.index[3]), ts[3]) - - # no as of value - d = ts.index[0].to_timestamp() - datetools.bday - self.assertTrue(np.isnan(ts.asof(d))) - - def test_asof_more(self): - from pandas import date_range - - s = Series([nan, nan, 1, 2, nan, nan, 3, 4, 5], - index=date_range('1/1/2000', periods=9)) - - dates = s.index[[4, 5, 6, 2, 1]] - - result = s.asof(dates) - expected = Series([2, 2, 3, 1, np.nan], index=dates) - - assert_series_equal(result, expected) - - s = Series([1.5, 2.5, 1, 2, nan, nan, 3, 4, 5], - index=date_range('1/1/2000', periods=9)) - result = s.asof(s.index[0]) - self.assertEqual(result, s[0]) - def test_asfreq(self): ts = Series([0., 1., 2.], index=[datetime(2009, 10, 30), datetime( 2009, 11, 30), datetime(2009, 12, 31)]) @@ -498,8 +416,8 @@ def test_asfreq(self): monthly_ts = daily_ts.asfreq('BM') self.assert_series_equal(monthly_ts, ts) - daily_ts = ts.asfreq(datetools.bday) - monthly_ts = daily_ts.asfreq(datetools.bmonthEnd) + daily_ts = ts.asfreq(BDay()) + monthly_ts = daily_ts.asfreq(BMonthEnd()) self.assert_series_equal(monthly_ts, ts) result = ts[:0].asfreq('M') @@ -633,3 +551,18 @@ def test_timeseries_coercion(self): 
self.assertTrue(ser.is_time_series) self.assertTrue(ser.index.is_all_dates) self.assertIsInstance(ser.index, DatetimeIndex) + + def test_empty_series_ops(self): + # see issue #13844 + a = Series(dtype='M8[ns]') + b = Series(dtype='m8[ns]') + assert_series_equal(a, a + b) + assert_series_equal(a, a - b) + assert_series_equal(a, b + a) + self.assertRaises(TypeError, lambda x, y: x - y, b, a) + + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 8af93ad0ecb2e..092e02ee261a0 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -4,8 +4,8 @@ import numpy as np from numpy.random import RandomState from numpy import nan -import datetime - +from datetime import datetime +from itertools import permutations from pandas import Series, Categorical, CategoricalIndex, Index import pandas as pd @@ -56,6 +56,93 @@ def test_strings(self): tm.assert_series_equal(result, expected) +class TestSafeSort(tm.TestCase): + _multiprocess_can_split_ = True + + def test_basic_sort(self): + values = [3, 1, 2, 0, 4] + result = algos.safe_sort(values) + expected = np.array([0, 1, 2, 3, 4]) + tm.assert_numpy_array_equal(result, expected) + + values = list("baaacb") + result = algos.safe_sort(values) + expected = np.array(list("aaabbc")) + tm.assert_numpy_array_equal(result, expected) + + values = [] + result = algos.safe_sort(values) + expected = np.array([]) + tm.assert_numpy_array_equal(result, expected) + + def test_labels(self): + values = [3, 1, 2, 0, 4] + expected = np.array([0, 1, 2, 3, 4]) + + labels = [0, 1, 1, 2, 3, 0, -1, 4] + result, result_labels = algos.safe_sort(values, labels) + expected_labels = np.array([3, 1, 1, 2, 0, 3, -1, 4], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + tm.assert_numpy_array_equal(result_labels, expected_labels) + + # na_sentinel + labels = [0, 1, 1, 2, 3, 0, 99, 4] + result, result_labels = algos.safe_sort(values, labels, + na_sentinel=99) + expected_labels = np.array([3, 1, 1, 2, 0, 3, 99, 4], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + tm.assert_numpy_array_equal(result_labels, expected_labels) + + # out of bound indices + labels = [0, 101, 102, 2, 3, 0, 99, 4] + result, result_labels = algos.safe_sort(values, labels) + expected_labels = np.array([3, -1, -1, 2, 0, 3, -1, 4], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + tm.assert_numpy_array_equal(result_labels, expected_labels) + + labels = [] + result, result_labels = algos.safe_sort(values, labels) + expected_labels = np.array([], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + tm.assert_numpy_array_equal(result_labels, expected_labels) + + def test_mixed_integer(self): + values = np.array(['b', 1, 0, 'a', 0, 'b'], dtype=object) + result = algos.safe_sort(values) + expected = np.array([0, 0, 1, 'a', 'b', 'b'], dtype=object) + tm.assert_numpy_array_equal(result, expected) + + values = np.array(['b', 1, 0, 'a'], dtype=object) + labels = [0, 1, 2, 3, 0, -1, 1] + result, result_labels = algos.safe_sort(values, labels) + expected = np.array([0, 1, 'a', 'b'], dtype=object) + expected_labels = np.array([3, 1, 0, 2, 3, -1, 1], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + tm.assert_numpy_array_equal(result_labels, expected_labels) + + def test_unsortable(self): + # GH 13714 + arr = np.array([1, 2, datetime.now(), 0, 3], dtype=object) + if compat.PY2 and not 
pd._np_version_under1p10: + # RuntimeWarning: tp_compare didn't return -1 or -2 for exception + with tm.assert_produces_warning(RuntimeWarning): + tm.assertRaises(TypeError, algos.safe_sort, arr) + else: + tm.assertRaises(TypeError, algos.safe_sort, arr) + + def test_exceptions(self): + with tm.assertRaisesRegexp(TypeError, + "Only list-like objects are allowed"): + algos.safe_sort(values=1) + + with tm.assertRaisesRegexp(TypeError, + "Only list-like objects or None"): + algos.safe_sort(values=[0, 1, 2], labels=1) + + with tm.assertRaisesRegexp(ValueError, "values should be unique"): + algos.safe_sort(values=[0, 1, 2, 1], labels=[0, 1]) + + class TestFactorize(tm.TestCase): _multiprocess_can_split_ = True @@ -68,33 +155,33 @@ def test_basic(self): labels, uniques = algos.factorize(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'], sort=True) - exp = np.array([0, 1, 1, 0, 0, 2, 2, 2], dtype=np.int_) + exp = np.array([0, 1, 1, 0, 0, 2, 2, 2], dtype=np.intp) self.assert_numpy_array_equal(labels, exp) exp = np.array(['a', 'b', 'c'], dtype=object) self.assert_numpy_array_equal(uniques, exp) labels, uniques = algos.factorize(list(reversed(range(5)))) - exp = np.array([0, 1, 2, 3, 4], dtype=np.int_) + exp = np.array([0, 1, 2, 3, 4], dtype=np.intp) self.assert_numpy_array_equal(labels, exp) exp = np.array([4, 3, 2, 1, 0], dtype=np.int64) self.assert_numpy_array_equal(uniques, exp) labels, uniques = algos.factorize(list(reversed(range(5))), sort=True) - exp = np.array([4, 3, 2, 1, 0], dtype=np.int_) + exp = np.array([4, 3, 2, 1, 0], dtype=np.intp) self.assert_numpy_array_equal(labels, exp) exp = np.array([0, 1, 2, 3, 4], dtype=np.int64) self.assert_numpy_array_equal(uniques, exp) labels, uniques = algos.factorize(list(reversed(np.arange(5.)))) - exp = np.array([0, 1, 2, 3, 4], dtype=np.int_) + exp = np.array([0, 1, 2, 3, 4], dtype=np.intp) self.assert_numpy_array_equal(labels, exp) exp = np.array([4., 3., 2., 1., 0.], dtype=np.float64) self.assert_numpy_array_equal(uniques, exp) labels, uniques = algos.factorize(list(reversed(np.arange(5.))), sort=True) - exp = np.array([4, 3, 2, 1, 0], dtype=np.int_) + exp = np.array([4, 3, 2, 1, 0], dtype=np.intp) self.assert_numpy_array_equal(labels, exp) exp = np.array([0., 1., 2., 3., 4.], dtype=np.float64) self.assert_numpy_array_equal(uniques, exp) @@ -105,13 +192,13 @@ def test_mixed(self): x = Series(['A', 'A', np.nan, 'B', 3.14, np.inf]) labels, uniques = algos.factorize(x) - exp = np.array([0, 0, -1, 1, 2, 3], dtype=np.int_) + exp = np.array([0, 0, -1, 1, 2, 3], dtype=np.intp) self.assert_numpy_array_equal(labels, exp) exp = pd.Index(['A', 'B', 3.14, np.inf]) tm.assert_index_equal(uniques, exp) labels, uniques = algos.factorize(x, sort=True) - exp = np.array([2, 2, -1, 3, 0, 1], dtype=np.int_) + exp = np.array([2, 2, -1, 3, 0, 1], dtype=np.intp) self.assert_numpy_array_equal(labels, exp) exp = pd.Index([3.14, np.inf, 'A', 'B']) tm.assert_index_equal(uniques, exp) @@ -124,13 +211,13 @@ def test_datelike(self): x = Series([v1, v1, v1, v2, v2, v1]) labels, uniques = algos.factorize(x) - exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.int_) + exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp) self.assert_numpy_array_equal(labels, exp) exp = pd.DatetimeIndex([v1, v2]) self.assert_index_equal(uniques, exp) labels, uniques = algos.factorize(x, sort=True) - exp = np.array([1, 1, 1, 0, 0, 1], dtype=np.int_) + exp = np.array([1, 1, 1, 0, 0, 1], dtype=np.intp) self.assert_numpy_array_equal(labels, exp) exp = pd.DatetimeIndex([v2, v1]) self.assert_index_equal(uniques, exp) @@ 
-142,12 +229,12 @@ def test_datelike(self): # periods are not 'sorted' as they are converted back into an index labels, uniques = algos.factorize(x) - exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.int_) + exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp) self.assert_numpy_array_equal(labels, exp) self.assert_index_equal(uniques, pd.PeriodIndex([v1, v2])) labels, uniques = algos.factorize(x, sort=True) - exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.int_) + exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp) self.assert_numpy_array_equal(labels, exp) self.assert_index_equal(uniques, pd.PeriodIndex([v1, v2])) @@ -156,12 +243,12 @@ def test_datelike(self): v2 = pd.to_timedelta('1 day') x = Series([v1, v2, v1, v1, v2, v2, v1]) labels, uniques = algos.factorize(x) - exp = np.array([0, 1, 0, 0, 1, 1, 0], dtype=np.int_) + exp = np.array([0, 1, 0, 0, 1, 1, 0], dtype=np.intp) self.assert_numpy_array_equal(labels, exp) self.assert_index_equal(uniques, pd.to_timedelta([v1, v2])) labels, uniques = algos.factorize(x, sort=True) - exp = np.array([1, 0, 1, 1, 0, 0, 1], dtype=np.int_) + exp = np.array([1, 0, 1, 1, 0, 0, 1], dtype=np.intp) self.assert_numpy_array_equal(labels, exp) self.assert_index_equal(uniques, pd.to_timedelta([v2, v1])) @@ -212,45 +299,15 @@ def _test_vector_resize(htable, uniques, dtype, nvals): _test_vector_resize(tbl(), vect(), dtype, 0) _test_vector_resize(tbl(), vect(), dtype, 10) + def test_complex_sorting(self): + # gh 12666 - check no segfault + # Test not valid numpy versions older than 1.11 + if pd._np_version_under1p11: + self.skipTest("Test valid only for numpy 1.11+") -class TestIndexer(tm.TestCase): - _multiprocess_can_split_ = True + x17 = np.array([complex(i) for i in range(17)], dtype=object) - def test_outer_join_indexer(self): - typemap = [('int32', algos.algos.outer_join_indexer_int32), - ('int64', algos.algos.outer_join_indexer_int64), - ('float32', algos.algos.outer_join_indexer_float32), - ('float64', algos.algos.outer_join_indexer_float64), - ('object', algos.algos.outer_join_indexer_object)] - - for dtype, indexer in typemap: - left = np.arange(3, dtype=dtype) - right = np.arange(2, 5, dtype=dtype) - empty = np.array([], dtype=dtype) - - result, lindexer, rindexer = indexer(left, right) - tm.assertIsInstance(result, np.ndarray) - tm.assertIsInstance(lindexer, np.ndarray) - tm.assertIsInstance(rindexer, np.ndarray) - tm.assert_numpy_array_equal(result, np.arange(5, dtype=dtype)) - exp = np.array([0, 1, 2, -1, -1], dtype=np.int64) - tm.assert_numpy_array_equal(lindexer, exp) - exp = np.array([-1, -1, 0, 1, 2], dtype=np.int64) - tm.assert_numpy_array_equal(rindexer, exp) - - result, lindexer, rindexer = indexer(empty, right) - tm.assert_numpy_array_equal(result, right) - exp = np.array([-1, -1, -1], dtype=np.int64) - tm.assert_numpy_array_equal(lindexer, exp) - exp = np.array([0, 1, 2], dtype=np.int64) - tm.assert_numpy_array_equal(rindexer, exp) - - result, lindexer, rindexer = indexer(left, empty) - tm.assert_numpy_array_equal(result, left) - exp = np.array([0, 1, 2], dtype=np.int64) - tm.assert_numpy_array_equal(lindexer, exp) - exp = np.array([-1, -1, -1], dtype=np.int64) - tm.assert_numpy_array_equal(rindexer, exp) + self.assertRaises(TypeError, algos.factorize, x17[::-1], sort=True) class TestUnique(tm.TestCase): @@ -470,6 +527,24 @@ def test_value_counts_nat(self): tm.assert_series_equal(algos.value_counts(dt), exp_dt) # TODO same for (timedelta) + def test_value_counts_datetime_outofbounds(self): + # GH 13663 + s = pd.Series([datetime(3000, 1, 1), datetime(5000, 
1, 1), + datetime(5000, 1, 1), datetime(6000, 1, 1), + datetime(3000, 1, 1), datetime(3000, 1, 1)]) + res = s.value_counts() + + exp_index = pd.Index([datetime(3000, 1, 1), datetime(5000, 1, 1), + datetime(6000, 1, 1)], dtype=object) + exp = pd.Series([3, 2, 1], index=exp_index) + tm.assert_series_equal(res, exp) + + # GH 12424 + res = pd.to_datetime(pd.Series(['2362-01-01', np.nan]), + errors='ignore') + exp = pd.Series(['2362-01-01', np.nan], dtype=object) + tm.assert_series_equal(res, exp) + def test_categorical(self): s = Series(pd.Categorical(list('aaabbc'))) result = s.value_counts() @@ -569,6 +644,157 @@ def test_value_counts_normalized(self): tm.assert_series_equal(result, expected) +class TestDuplicated(tm.TestCase): + + _multiprocess_can_split_ = True + + def test_duplicated_with_nas(self): + keys = np.array([0, 1, np.nan, 0, 2, np.nan], dtype=object) + + result = algos.duplicated(keys) + expected = np.array([False, False, False, True, False, True]) + tm.assert_numpy_array_equal(result, expected) + + result = algos.duplicated(keys, keep='first') + expected = np.array([False, False, False, True, False, True]) + tm.assert_numpy_array_equal(result, expected) + + result = algos.duplicated(keys, keep='last') + expected = np.array([True, False, True, False, False, False]) + tm.assert_numpy_array_equal(result, expected) + + result = algos.duplicated(keys, keep=False) + expected = np.array([True, False, True, True, False, True]) + tm.assert_numpy_array_equal(result, expected) + + keys = np.empty(8, dtype=object) + for i, t in enumerate(zip([0, 0, np.nan, np.nan] * 2, + [0, np.nan, 0, np.nan] * 2)): + keys[i] = t + + result = algos.duplicated(keys) + falses = [False] * 4 + trues = [True] * 4 + expected = np.array(falses + trues) + tm.assert_numpy_array_equal(result, expected) + + result = algos.duplicated(keys, keep='last') + expected = np.array(trues + falses) + tm.assert_numpy_array_equal(result, expected) + + result = algos.duplicated(keys, keep=False) + expected = np.array(trues + trues) + tm.assert_numpy_array_equal(result, expected) + + def test_numeric_object_likes(self): + cases = [np.array([1, 2, 1, 5, 3, + 2, 4, 1, 5, 6]), + np.array([1.1, 2.2, 1.1, np.nan, 3.3, + 2.2, 4.4, 1.1, np.nan, 6.6]), + np.array([1 + 1j, 2 + 2j, 1 + 1j, 5 + 5j, 3 + 3j, + 2 + 2j, 4 + 4j, 1 + 1j, 5 + 5j, 6 + 6j]), + np.array(['a', 'b', 'a', 'e', 'c', + 'b', 'd', 'a', 'e', 'f'], dtype=object)] + + exp_first = np.array([False, False, True, False, False, + True, False, True, True, False]) + exp_last = np.array([True, True, True, True, False, + False, False, False, False, False]) + exp_false = exp_first | exp_last + + for case in cases: + res_first = algos.duplicated(case, keep='first') + tm.assert_numpy_array_equal(res_first, exp_first) + + res_last = algos.duplicated(case, keep='last') + tm.assert_numpy_array_equal(res_last, exp_last) + + res_false = algos.duplicated(case, keep=False) + tm.assert_numpy_array_equal(res_false, exp_false) + + # index + for idx in [pd.Index(case), pd.Index(case, dtype='category')]: + res_first = idx.duplicated(keep='first') + tm.assert_numpy_array_equal(res_first, exp_first) + + res_last = idx.duplicated(keep='last') + tm.assert_numpy_array_equal(res_last, exp_last) + + res_false = idx.duplicated(keep=False) + tm.assert_numpy_array_equal(res_false, exp_false) + + # series + for s in [pd.Series(case), pd.Series(case, dtype='category')]: + res_first = s.duplicated(keep='first') + tm.assert_series_equal(res_first, pd.Series(exp_first)) + + res_last = s.duplicated(keep='last') + 
tm.assert_series_equal(res_last, pd.Series(exp_last)) + + res_false = s.duplicated(keep=False) + tm.assert_series_equal(res_false, pd.Series(exp_false)) + + def test_datetime_likes(self): + + dt = ['2011-01-01', '2011-01-02', '2011-01-01', 'NaT', '2011-01-03', + '2011-01-02', '2011-01-04', '2011-01-01', 'NaT', '2011-01-06'] + td = ['1 days', '2 days', '1 days', 'NaT', '3 days', + '2 days', '4 days', '1 days', 'NaT', '6 days'] + + cases = [np.array([pd.Timestamp(d) for d in dt]), + np.array([pd.Timestamp(d, tz='US/Eastern') for d in dt]), + np.array([pd.Period(d, freq='D') for d in dt]), + np.array([np.datetime64(d) for d in dt]), + np.array([pd.Timedelta(d) for d in td])] + + exp_first = np.array([False, False, True, False, False, + True, False, True, True, False]) + exp_last = np.array([True, True, True, True, False, + False, False, False, False, False]) + exp_false = exp_first | exp_last + + for case in cases: + res_first = algos.duplicated(case, keep='first') + tm.assert_numpy_array_equal(res_first, exp_first) + + res_last = algos.duplicated(case, keep='last') + tm.assert_numpy_array_equal(res_last, exp_last) + + res_false = algos.duplicated(case, keep=False) + tm.assert_numpy_array_equal(res_false, exp_false) + + # index + for idx in [pd.Index(case), pd.Index(case, dtype='category'), + pd.Index(case, dtype=object)]: + res_first = idx.duplicated(keep='first') + tm.assert_numpy_array_equal(res_first, exp_first) + + res_last = idx.duplicated(keep='last') + tm.assert_numpy_array_equal(res_last, exp_last) + + res_false = idx.duplicated(keep=False) + tm.assert_numpy_array_equal(res_false, exp_false) + + # series + for s in [pd.Series(case), pd.Series(case, dtype='category'), + pd.Series(case, dtype=object)]: + res_first = s.duplicated(keep='first') + tm.assert_series_equal(res_first, pd.Series(exp_first)) + + res_last = s.duplicated(keep='last') + tm.assert_series_equal(res_last, pd.Series(exp_last)) + + res_false = s.duplicated(keep=False) + tm.assert_series_equal(res_false, pd.Series(exp_false)) + + def test_unique_index(self): + cases = [pd.Index([1, 2, 3]), pd.RangeIndex(0, 3)] + for case in cases: + self.assertTrue(case.is_unique) + tm.assert_numpy_array_equal(case.duplicated(), + np.array([False, False, False])) + + class GroupVarTestMixin(object): def test_group_var_generic_1d(self): @@ -702,12 +928,14 @@ def test_unique_label_indices(): left = unique_label_indices(a) right = np.unique(a, return_index=True)[1] - tm.assert_numpy_array_equal(left, right) + tm.assert_numpy_array_equal(left, right, + check_dtype=False) a[np.random.choice(len(a), 10)] = -1 left = unique_label_indices(a) right = np.unique(a, return_index=True)[1][1:] - tm.assert_numpy_array_equal(left, right) + tm.assert_numpy_array_equal(left, right, + check_dtype=False) def test_rank(): @@ -730,7 +958,7 @@ def _check(arr): def test_pad_backfill_object_segfault(): old = np.array([], dtype='O') - new = np.array([datetime.datetime(2010, 12, 31)], dtype='O') + new = np.array([datetime(2010, 12, 31)], dtype='O') result = _algos.pad_object(old, new) expected = np.array([-1], dtype=np.int64) @@ -809,153 +1037,6 @@ def test_pad(self): self.assert_numpy_array_equal(filler, expect_filler) -def test_left_join_indexer_unique(): - a = np.array([1, 2, 3, 4, 5], dtype=np.int64) - b = np.array([2, 2, 3, 4, 4], dtype=np.int64) - - result = _algos.left_join_indexer_unique_int64(b, a) - expected = np.array([1, 1, 2, 3, 3], dtype=np.int64) - assert (np.array_equal(result, expected)) - - -def test_left_outer_join_bug(): - left = np.array([0, 
1, 0, 1, 1, 2, 3, 1, 0, 2, 1, 2, 0, 1, 1, 2, 3, 2, 3, - 2, 1, 1, 3, 0, 3, 2, 3, 0, 0, 2, 3, 2, 0, 3, 1, 3, 0, 1, - 3, 0, 0, 1, 0, 3, 1, 0, 1, 0, 1, 1, 0, 2, 2, 2, 2, 2, 0, - 3, 1, 2, 0, 0, 3, 1, 3, 2, 2, 0, 1, 3, 0, 2, 3, 2, 3, 3, - 2, 3, 3, 1, 3, 2, 0, 0, 3, 1, 1, 1, 0, 2, 3, 3, 1, 2, 0, - 3, 1, 2, 0, 2], dtype=np.int64) - - right = np.array([3, 1], dtype=np.int64) - max_groups = 4 - - lidx, ridx = _algos.left_outer_join(left, right, max_groups, sort=False) - - exp_lidx = np.arange(len(left)) - exp_ridx = -np.ones(len(left)) - exp_ridx[left == 1] = 1 - exp_ridx[left == 3] = 0 - - assert (np.array_equal(lidx, exp_lidx)) - assert (np.array_equal(ridx, exp_ridx)) - - -def test_inner_join_indexer(): - a = np.array([1, 2, 3, 4, 5], dtype=np.int64) - b = np.array([0, 3, 5, 7, 9], dtype=np.int64) - - index, ares, bres = _algos.inner_join_indexer_int64(a, b) - - index_exp = np.array([3, 5], dtype=np.int64) - assert_almost_equal(index, index_exp) - - aexp = np.array([2, 4], dtype=np.int64) - bexp = np.array([1, 2], dtype=np.int64) - assert_almost_equal(ares, aexp) - assert_almost_equal(bres, bexp) - - a = np.array([5], dtype=np.int64) - b = np.array([5], dtype=np.int64) - - index, ares, bres = _algos.inner_join_indexer_int64(a, b) - tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64)) - tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.int64)) - tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.int64)) - - -def test_outer_join_indexer(): - a = np.array([1, 2, 3, 4, 5], dtype=np.int64) - b = np.array([0, 3, 5, 7, 9], dtype=np.int64) - - index, ares, bres = _algos.outer_join_indexer_int64(a, b) - - index_exp = np.array([0, 1, 2, 3, 4, 5, 7, 9], dtype=np.int64) - assert_almost_equal(index, index_exp) - - aexp = np.array([-1, 0, 1, 2, 3, 4, -1, -1], dtype=np.int64) - bexp = np.array([0, -1, -1, 1, -1, 2, 3, 4], dtype=np.int64) - assert_almost_equal(ares, aexp) - assert_almost_equal(bres, bexp) - - a = np.array([5], dtype=np.int64) - b = np.array([5], dtype=np.int64) - - index, ares, bres = _algos.outer_join_indexer_int64(a, b) - tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64)) - tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.int64)) - tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.int64)) - - -def test_left_join_indexer(): - a = np.array([1, 2, 3, 4, 5], dtype=np.int64) - b = np.array([0, 3, 5, 7, 9], dtype=np.int64) - - index, ares, bres = _algos.left_join_indexer_int64(a, b) - - assert_almost_equal(index, a) - - aexp = np.array([0, 1, 2, 3, 4], dtype=np.int64) - bexp = np.array([-1, -1, 1, -1, 2], dtype=np.int64) - assert_almost_equal(ares, aexp) - assert_almost_equal(bres, bexp) - - a = np.array([5], dtype=np.int64) - b = np.array([5], dtype=np.int64) - - index, ares, bres = _algos.left_join_indexer_int64(a, b) - tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64)) - tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.int64)) - tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.int64)) - - -def test_left_join_indexer2(): - idx = Index([1, 1, 2, 5]) - idx2 = Index([1, 2, 5, 7, 9]) - - res, lidx, ridx = _algos.left_join_indexer_int64(idx2.values, idx.values) - - exp_res = np.array([1, 1, 2, 5, 7, 9], dtype=np.int64) - assert_almost_equal(res, exp_res) - - exp_lidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.int64) - assert_almost_equal(lidx, exp_lidx) - - exp_ridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.int64) - assert_almost_equal(ridx, exp_ridx) - - -def test_outer_join_indexer2(): - idx = Index([1, 1, 2, 5]) - 
idx2 = Index([1, 2, 5, 7, 9]) - - res, lidx, ridx = _algos.outer_join_indexer_int64(idx2.values, idx.values) - - exp_res = np.array([1, 1, 2, 5, 7, 9], dtype=np.int64) - assert_almost_equal(res, exp_res) - - exp_lidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.int64) - assert_almost_equal(lidx, exp_lidx) - - exp_ridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.int64) - assert_almost_equal(ridx, exp_ridx) - - -def test_inner_join_indexer2(): - idx = Index([1, 1, 2, 5]) - idx2 = Index([1, 2, 5, 7, 9]) - - res, lidx, ridx = _algos.inner_join_indexer_int64(idx2.values, idx.values) - - exp_res = np.array([1, 1, 2, 5], dtype=np.int64) - assert_almost_equal(res, exp_res) - - exp_lidx = np.array([0, 0, 1, 2], dtype=np.int64) - assert_almost_equal(lidx, exp_lidx) - - exp_ridx = np.array([0, 1, 2, 3], dtype=np.int64) - assert_almost_equal(ridx, exp_ridx) - - def test_is_lexsorted(): failure = [ np.array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, @@ -1012,8 +1093,37 @@ def test_groupsort_indexer(): assert (np.array_equal(result, expected)) +def test_infinity_sort(): + # GH 13445 + # numpy's argsort can be unhappy if something is less than + # itself. Instead, let's give our infinities a self-consistent + # ordering, but outside the float extended real line. + + Inf = _algos.Infinity() + NegInf = _algos.NegInfinity() + + ref_nums = [NegInf, float("-inf"), -1e100, 0, 1e100, float("inf"), Inf] + + assert all(Inf >= x for x in ref_nums) + assert all(Inf > x or x is Inf for x in ref_nums) + assert Inf >= Inf and Inf == Inf + assert not Inf < Inf and not Inf > Inf + + assert all(NegInf <= x for x in ref_nums) + assert all(NegInf < x or x is NegInf for x in ref_nums) + assert NegInf <= NegInf and NegInf == NegInf + assert not NegInf < NegInf and not NegInf > NegInf + + for perm in permutations(ref_nums): + assert sorted(perm) == ref_nums + + # smoke tests + np.array([_algos.Infinity()] * 32).argsort() + np.array([_algos.NegInfinity()] * 32).argsort() + + def test_ensure_platform_int(): - arr = np.arange(100) + arr = np.arange(100, dtype=np.intp) result = _algos.ensure_platform_int(arr) assert (result is arr) diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 77ae3ca20d123..eaa316bfd8157 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -9,7 +9,8 @@ import pandas as pd import pandas.compat as compat -import pandas.core.common as com +from pandas.types.common import (is_object_dtype, is_datetimetz, + needs_i8_conversion) import pandas.util.testing as tm from pandas import (Series, Index, DatetimeIndex, TimedeltaIndex, PeriodIndex, Timedelta) @@ -393,7 +394,7 @@ def test_ops(self): if not isinstance(o, PeriodIndex): expected = getattr(o.values, op)() else: - expected = pd.Period(ordinal=getattr(o.values, op)(), + expected = pd.Period(ordinal=getattr(o._values, op)(), freq=o.freq) try: self.assertEqual(result, expected) @@ -449,46 +450,31 @@ def test_nanops(self): def test_value_counts_unique_nunique(self): for orig in self.objs: - o = orig.copy() klass = type(o) - values = o.values - - # create repeated values, 'n'th element is repeated by n+1 times - if isinstance(o, PeriodIndex): - # freq must be specified because repeat makes freq ambiguous - - # resets name from Index - expected_index = pd.Index(o[::-1]) - expected_index.name = None + values = o._values - # attach name to klass - o = o.repeat(range(1, len(o) + 1)) - o.name = 'a' + if isinstance(values, Index): + # reset name not to affect latter process + values.name = None - elif isinstance(o, 
DatetimeIndex): - - # resets name from Index - expected_index = pd.Index(o[::-1]) - expected_index.name = None - - # attach name to klass - o = o.repeat(range(1, len(o) + 1)) - o.name = 'a' - - # don't test boolean - elif isinstance(o, Index) and o.is_boolean(): + # create repeated values, 'n'th element is repeated by n+1 times + # skip boolean, because it only has 2 values at most + if isinstance(o, Index) and o.is_boolean(): continue elif isinstance(o, Index): - expected_index = pd.Index(values[::-1]) + expected_index = pd.Index(o[::-1]) expected_index.name = None o = o.repeat(range(1, len(o) + 1)) o.name = 'a' else: expected_index = pd.Index(values[::-1]) idx = o.index.repeat(range(1, len(o) + 1)) - o = klass(np.repeat(values, range(1, len(o) + 1)), - index=idx, name='a') + rep = np.repeat(values, range(1, len(o) + 1)) + o = klass(rep, index=idx, name='a') + + # check values has the same dtype as the original + self.assertEqual(o.dtype, orig.dtype) expected_s = Series(range(10, 0, -1), index=expected_index, dtype='int64', name='a') @@ -499,25 +485,34 @@ def test_value_counts_unique_nunique(self): self.assertEqual(result.name, 'a') result = o.unique() - if isinstance(o, (DatetimeIndex, PeriodIndex)): + if isinstance(o, Index): self.assertTrue(isinstance(result, o.__class__)) - self.assertEqual(result.freq, o.freq) self.assert_index_equal(result, orig) + elif is_datetimetz(o): + # datetimetz Series returns array of Timestamp + self.assertEqual(result[0], orig[0]) + for r in result: + self.assertIsInstance(r, pd.Timestamp) + tm.assert_numpy_array_equal(result, + orig._values.asobject.values) else: - self.assert_numpy_array_equal(result, values) + tm.assert_numpy_array_equal(result, orig.values) self.assertEqual(o.nunique(), len(np.unique(o.values))) + def test_value_counts_unique_nunique_null(self): + for null_obj in [np.nan, None]: - for o in self.objs: + for orig in self.objs: + o = orig.copy() klass = type(o) - values = o.values + values = o._values if not self._allow_na_ops(o): continue # special assign to the numpy array - if com.is_datetimetz(o): + if is_datetimetz(o): if isinstance(o, DatetimeIndex): v = o.asi8 v[0:2] = pd.tslib.iNaT @@ -525,34 +520,45 @@ def test_value_counts_unique_nunique(self): else: o = o.copy() o[0:2] = pd.tslib.iNaT - values = o.values - elif o.values.dtype == 'datetime64[ns]' or isinstance( - o, PeriodIndex): + values = o._values + + elif needs_i8_conversion(o): values[0:2] = pd.tslib.iNaT + values = o._shallow_copy(values) else: values[0:2] = null_obj + # check values has the same dtype as the original + + self.assertEqual(values.dtype, o.dtype) # create repeated values, 'n'th element is repeated by n+1 # times - if isinstance(o, PeriodIndex): - # freq must be specified because repeat makes freq - # ambiguous + if isinstance(o, (DatetimeIndex, PeriodIndex)): + expected_index = o.copy() + expected_index.name = None - # resets name from Index - expected_index = pd.Index(o, name=None) # attach name to klass - o = klass(np.repeat(values, range(1, len(o) + 1)), - freq=o.freq, name='a') - elif isinstance(o, Index): - expected_index = pd.Index(values, name=None) - o = klass( - np.repeat(values, range(1, len(o) + 1)), name='a') + o = klass(values.repeat(range(1, len(o) + 1))) + o.name = 'a' + else: + if is_datetimetz(o): + expected_index = orig._values._shallow_copy(values) + else: + expected_index = pd.Index(values) + expected_index.name = None + o = o.repeat(range(1, len(o) + 1)) + o.name = 'a' + + # check values has the same dtype as the original + 
self.assertEqual(o.dtype, orig.dtype) + # check values correctly have NaN + nanloc = np.zeros(len(o), dtype=np.bool) + nanloc[:3] = True + if isinstance(o, Index): + self.assert_numpy_array_equal(pd.isnull(o), nanloc) else: - expected_index = pd.Index(values, name=None) - idx = np.repeat(o.index.values, range(1, len(o) + 1)) - o = klass( - np.repeat(values, range( - 1, len(o) + 1)), index=idx, name='a') + exp = pd.Series(nanloc, o.index, name='a') + self.assert_series_equal(pd.isnull(o), exp) expected_s_na = Series(list(range(10, 2, -1)) + [3], index=expected_index[9:0:-1], @@ -570,14 +576,20 @@ def test_value_counts_unique_nunique(self): self.assertTrue(result_s.index.name is None) self.assertEqual(result_s.name, 'a') - # numpy_array_equal cannot compare arrays includes nan result = o.unique() - self.assert_numpy_array_equal(result[1:], values[2:]) - - if isinstance(o, (DatetimeIndex, PeriodIndex)): - self.assertTrue(result.asi8[0] == pd.tslib.iNaT) + if isinstance(o, Index): + tm.assert_index_equal(result, + Index(values[1:], name='a')) + elif is_datetimetz(o): + # unable to compare NaT / nan + tm.assert_numpy_array_equal(result[1:], + values[2:].asobject.values) + self.assertIs(result[0], pd.NaT) else: + tm.assert_numpy_array_equal(result[1:], values[2:]) + self.assertTrue(pd.isnull(result[0])) + self.assertEqual(result.dtype, orig.dtype) self.assertEqual(o.nunique(), 8) self.assertEqual(o.nunique(dropna=False), 9) @@ -590,8 +602,13 @@ def test_value_counts_inferred(self): expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c']) tm.assert_series_equal(s.value_counts(), expected) - exp = np.unique(np.array(s_values, dtype=np.object_)) - self.assert_numpy_array_equal(s.unique(), exp) + if isinstance(s, Index): + exp = Index(np.unique(np.array(s_values, dtype=np.object_))) + tm.assert_index_equal(s.unique(), exp) + else: + exp = np.unique(np.array(s_values, dtype=np.object_)) + tm.assert_numpy_array_equal(s.unique(), exp) + self.assertEqual(s.nunique(), 4) # don't sort, have to sort after the fact as not sorting is # platform-dep @@ -627,8 +644,12 @@ def test_value_counts_bins(self): exp1n = Series({0.998: 1.0}) tm.assert_series_equal(res1n, exp1n) - self.assert_numpy_array_equal(s1.unique(), - np.array([1, 2, 3], dtype=np.int64)) + if isinstance(s1, Index): + tm.assert_index_equal(s1.unique(), Index([1, 2, 3])) + else: + exp = np.array([1, 2, 3], dtype=np.int64) + tm.assert_numpy_array_equal(s1.unique(), exp) + self.assertEqual(s1.nunique(), 3) res4 = s1.value_counts(bins=4) @@ -652,8 +673,12 @@ def test_value_counts_bins(self): expected = Series([4, 3, 2], index=['b', 'a', 'd']) tm.assert_series_equal(s.value_counts(), expected) - exp = np.array(['a', 'b', np.nan, 'd'], dtype=np.object_) - self.assert_numpy_array_equal(s.unique(), exp) + if isinstance(s, Index): + exp = Index(['a', 'b', np.nan, 'd']) + tm.assert_index_equal(s.unique(), exp) + else: + exp = np.array(['a', 'b', np.nan, 'd'], dtype=object) + tm.assert_numpy_array_equal(s.unique(), exp) self.assertEqual(s.nunique(), 3) s = klass({}) @@ -661,8 +686,13 @@ def test_value_counts_bins(self): tm.assert_series_equal(s.value_counts(), expected, check_index_type=False) # returned dtype differs depending on original - self.assert_numpy_array_equal(s.unique(), np.array([]), - check_dtype=False) + if isinstance(s, Index): + self.assert_index_equal(s.unique(), Index([]), + exact=False) + else: + self.assert_numpy_array_equal(s.unique(), np.array([]), + check_dtype=False) + self.assertEqual(s.nunique(), 0) def 
test_value_counts_datetime64(self): @@ -691,10 +721,10 @@ def test_value_counts_datetime64(self): '2009-01-01 00:00:00Z', '2008-09-09 00:00:00Z'], dtype='datetime64[ns]') - if isinstance(s, DatetimeIndex): - self.assert_index_equal(s.unique(), DatetimeIndex(expected)) + if isinstance(s, Index): + tm.assert_index_equal(s.unique(), DatetimeIndex(expected)) else: - self.assert_numpy_array_equal(s.unique(), expected) + tm.assert_numpy_array_equal(s.unique(), expected) self.assertEqual(s.nunique(), 3) @@ -714,12 +744,12 @@ def test_value_counts_datetime64(self): self.assertEqual(unique.dtype, 'datetime64[ns]') # numpy_array_equal cannot compare pd.NaT - if isinstance(s, DatetimeIndex): - self.assert_index_equal(unique[:3], DatetimeIndex(expected)) + if isinstance(s, Index): + exp_idx = DatetimeIndex(expected.tolist() + [pd.NaT]) + tm.assert_index_equal(unique, exp_idx) else: - self.assert_numpy_array_equal(unique[:3], expected) - self.assertTrue(unique[3] is pd.NaT or - unique[3].astype('int64') == pd.tslib.iNaT) + tm.assert_numpy_array_equal(unique[:3], expected) + self.assertTrue(pd.isnull(unique[3])) self.assertEqual(s.nunique(), 3) self.assertEqual(s.nunique(dropna=False), 4) @@ -733,10 +763,10 @@ def test_value_counts_datetime64(self): tm.assert_series_equal(result, expected_s) expected = TimedeltaIndex(['1 days'], name='dt') - if isinstance(td, TimedeltaIndex): - self.assert_index_equal(td.unique(), expected) + if isinstance(td, Index): + tm.assert_index_equal(td.unique(), expected) else: - self.assert_numpy_array_equal(td.unique(), expected.values) + tm.assert_numpy_array_equal(td.unique(), expected.values) td2 = timedelta(1) + (df.dt - df.dt) td2 = klass(td2, name='dt') @@ -748,11 +778,11 @@ def test_factorize(self): o = orig.copy() if isinstance(o, Index) and o.is_boolean(): - exp_arr = np.array([0, 1] + [0] * 8) + exp_arr = np.array([0, 1] + [0] * 8, dtype=np.intp) exp_uniques = o exp_uniques = Index([False, True]) else: - exp_arr = np.array(range(len(o))) + exp_arr = np.array(range(len(o)), dtype=np.intp) exp_uniques = o labels, uniques = o.factorize() @@ -782,7 +812,8 @@ def test_factorize_repeated(self): o = o.take(indexer) n = o[5:].append(o) - exp_arr = np.array([5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) + exp_arr = np.array([5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + dtype=np.intp) labels, uniques = n.factorize(sort=True) self.assert_numpy_array_equal(labels, exp_arr) @@ -792,7 +823,8 @@ def test_factorize_repeated(self): else: self.assert_index_equal(uniques, o, check_names=False) - exp_arr = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4]) + exp_arr = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4], + np.intp) labels, uniques = n.factorize(sort=False) self.assert_numpy_array_equal(labels, exp_arr) @@ -921,18 +953,14 @@ def test_fillna(self): # # GH 11343 # though Index.fillna and Series.fillna has separate impl, # test here to confirm these works as the same - def get_fill_value(obj): - if isinstance(obj, pd.tseries.base.DatetimeIndexOpsMixin): - return obj.asobject.values[0] - else: - return obj.values[0] - for o in self.objs: - klass = type(o) + for orig in self.objs: + + o = orig.copy() values = o.values # values will not be changed - result = o.fillna(get_fill_value(o)) + result = o.fillna(o.astype(object).values[0]) if isinstance(o, Index): self.assert_index_equal(o, result) else: @@ -941,33 +969,30 @@ def get_fill_value(obj): self.assertFalse(o is result) for null_obj in [np.nan, None]: - for o in self.objs: + for orig in self.objs: + o = 
orig.copy() klass = type(o) - values = o.values.copy() if not self._allow_na_ops(o): continue - # value for filling - fill_value = get_fill_value(o) + if needs_i8_conversion(o): - # special assign to the numpy array - if o.values.dtype == 'datetime64[ns]' or isinstance( - o, PeriodIndex): - values[0:2] = pd.tslib.iNaT + values = o.astype(object).values + fill_value = values[0] + values[0:2] = pd.NaT else: + values = o.values.copy() + fill_value = o.values[0] values[0:2] = null_obj - if isinstance(o, PeriodIndex): - # freq must be specified because repeat makes freq - # ambiguous - expected = [fill_value.ordinal] * 2 + list(values[2:]) - expected = klass(ordinal=expected, freq=o.freq) - o = klass(ordinal=values, freq=o.freq) - else: - expected = [fill_value] * 2 + list(values[2:]) - expected = klass(expected) - o = klass(values) + expected = [fill_value] * 2 + list(values[2:]) + + expected = klass(expected) + o = klass(values) + + # check values has the same dtype as the original + self.assertEqual(o.dtype, orig.dtype) result = o.fillna(fill_value) if isinstance(o, Index): @@ -982,8 +1007,8 @@ def test_memory_usage(self): res = o.memory_usage() res_deep = o.memory_usage(deep=True) - if (com.is_object_dtype(o) or (isinstance(o, Series) and - com.is_object_dtype(o.index))): + if (is_object_dtype(o) or (isinstance(o, Series) and + is_object_dtype(o.index))): # if there are objects, only deep will pick them up self.assertTrue(res_deep > res) else: diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index cff5bbe14f1eb..a494a0d53b123 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -1,19 +1,23 @@ # -*- coding: utf-8 -*- # pylint: disable=E1101,E1103,W0232 -import os import sys from datetime import datetime from distutils.version import LooseVersion import numpy as np +from pandas.types.dtypes import CategoricalDtype +from pandas.types.common import (is_categorical_dtype, + is_object_dtype, + is_float_dtype, + is_integer_dtype) + import pandas as pd import pandas.compat as compat -import pandas.core.common as com import pandas.util.testing as tm from pandas import (Categorical, Index, Series, DataFrame, PeriodIndex, - Timestamp, CategoricalIndex) + Timestamp, CategoricalIndex, isnull) from pandas.compat import range, lrange, u, PY3 from pandas.core.config import option_context @@ -25,9 +29,8 @@ class TestCategorical(tm.TestCase): _multiprocess_can_split_ = True def setUp(self): - self.factor = Categorical.from_array(['a', 'b', 'b', 'a', - 'a', 'c', 'c', 'c'], - ordered=True) + self.factor = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'], + ordered=True) def test_getitem(self): self.assertEqual(self.factor[0], 'a') @@ -66,8 +69,8 @@ def test_setitem(self): indexer[0] = True indexer[-1] = True c[indexer] = 'c' - expected = Categorical.from_array(['c', 'b', 'b', 'a', - 'a', 'c', 'c', 'c'], ordered=True) + expected = Categorical(['c', 'b', 'b', 'a', 'a', 'c', 'c', 'c'], + ordered=True) self.assert_categorical_equal(c, expected) @@ -90,21 +93,12 @@ def test_constructor_unsortable(self): # it works! 
arr = np.array([1, 2, 3, datetime.now()], dtype='O') - factor = Categorical.from_array(arr, ordered=False) + factor = Categorical(arr, ordered=False) self.assertFalse(factor.ordered) - if compat.PY3: - self.assertRaises( - TypeError, lambda: Categorical.from_array(arr, ordered=True)) - else: - # this however will raise as cannot be sorted (on PY3 or older - # numpies) - if LooseVersion(np.__version__) < "1.10": - self.assertRaises( - TypeError, - lambda: Categorical.from_array(arr, ordered=True)) - else: - Categorical.from_array(arr, ordered=True) + # this however will raise as cannot be sorted + self.assertRaises( + TypeError, lambda: Categorical(arr, ordered=True)) def test_is_equal_dtype(self): @@ -195,18 +189,18 @@ def f(): # This should result in integer categories, not float! cat = pd.Categorical([1, 2, 3, np.nan], categories=[1, 2, 3]) - self.assertTrue(com.is_integer_dtype(cat.categories)) + self.assertTrue(is_integer_dtype(cat.categories)) # https://github.com/pydata/pandas/issues/3678 cat = pd.Categorical([np.nan, 1, 2, 3]) - self.assertTrue(com.is_integer_dtype(cat.categories)) + self.assertTrue(is_integer_dtype(cat.categories)) # this should result in floats cat = pd.Categorical([np.nan, 1, 2., 3]) - self.assertTrue(com.is_float_dtype(cat.categories)) + self.assertTrue(is_float_dtype(cat.categories)) cat = pd.Categorical([np.nan, 1., 2., 3.]) - self.assertTrue(com.is_float_dtype(cat.categories)) + self.assertTrue(is_float_dtype(cat.categories)) # Deprecating NaNs in categoires (GH #10748) # preserve int as far as possible by converting to object if NaN is in @@ -214,23 +208,23 @@ def f(): with tm.assert_produces_warning(FutureWarning): cat = pd.Categorical([np.nan, 1, 2, 3], categories=[np.nan, 1, 2, 3]) - self.assertTrue(com.is_object_dtype(cat.categories)) + self.assertTrue(is_object_dtype(cat.categories)) # This doesn't work -> this would probably need some kind of "remember # the original type" feature to try to cast the array interface result # to... 
# vals = np.asarray(cat[cat.notnull()]) - # self.assertTrue(com.is_integer_dtype(vals)) + # self.assertTrue(is_integer_dtype(vals)) with tm.assert_produces_warning(FutureWarning): cat = pd.Categorical([np.nan, "a", "b", "c"], categories=[np.nan, "a", "b", "c"]) - self.assertTrue(com.is_object_dtype(cat.categories)) + self.assertTrue(is_object_dtype(cat.categories)) # but don't do it for floats with tm.assert_produces_warning(FutureWarning): cat = pd.Categorical([np.nan, 1., 2., 3.], categories=[np.nan, 1., 2., 3.]) - self.assertTrue(com.is_float_dtype(cat.categories)) + self.assertTrue(is_float_dtype(cat.categories)) # corner cases cat = pd.Categorical([1]) @@ -346,28 +340,44 @@ def test_constructor_with_datetimelike(self): def test_constructor_from_index_series_datetimetz(self): idx = pd.date_range('2015-01-01 10:00', freq='D', periods=3, tz='US/Eastern') - result = pd.Categorical.from_array(idx) + result = pd.Categorical(idx) tm.assert_index_equal(result.categories, idx) - result = pd.Categorical.from_array(pd.Series(idx)) + result = pd.Categorical(pd.Series(idx)) tm.assert_index_equal(result.categories, idx) def test_constructor_from_index_series_timedelta(self): idx = pd.timedelta_range('1 days', freq='D', periods=3) - result = pd.Categorical.from_array(idx) + result = pd.Categorical(idx) tm.assert_index_equal(result.categories, idx) - result = pd.Categorical.from_array(pd.Series(idx)) + result = pd.Categorical(pd.Series(idx)) tm.assert_index_equal(result.categories, idx) def test_constructor_from_index_series_period(self): idx = pd.period_range('2015-01-01', freq='D', periods=3) - result = pd.Categorical.from_array(idx) + result = pd.Categorical(idx) tm.assert_index_equal(result.categories, idx) - result = pd.Categorical.from_array(pd.Series(idx)) + result = pd.Categorical(pd.Series(idx)) tm.assert_index_equal(result.categories, idx) + def test_constructor_invariant(self): + # GH 14190 + vals = [ + np.array([1., 1.2, 1.8, np.nan]), + np.array([1, 2, 3], dtype='int64'), + ['a', 'b', 'c', np.nan], + [pd.Period('2014-01'), pd.Period('2014-02'), pd.NaT], + [pd.Timestamp('2014-01-01'), pd.Timestamp('2014-01-02'), pd.NaT], + [pd.Timestamp('2014-01-01', tz='US/Eastern'), + pd.Timestamp('2014-01-02', tz='US/Eastern'), pd.NaT], + ] + for val in vals: + c = Categorical(val) + c2 = Categorical(c) + tm.assert_categorical_equal(c, c2) + def test_from_codes(self): # too few categories @@ -403,6 +413,21 @@ def f(): codes = np.random.choice([0, 1], 5, p=[0.9, 0.1]) pd.Categorical.from_codes(codes, categories=["train", "test"]) + def test_validate_ordered(self): + # see gh-14058 + exp_msg = "'ordered' must either be 'True' or 'False'" + exp_err = TypeError + + # This should be a boolean. 
+ ordered = np.array([0, 1, 2]) + + with tm.assertRaisesRegexp(exp_err, exp_msg): + Categorical([1, 2, 3], ordered=ordered) + + with tm.assertRaisesRegexp(exp_err, exp_msg): + Categorical.from_codes([0, 0, 1], categories=['a', 'b', 'c'], + ordered=ordered) + def test_comparisons(self): result = self.factor[self.factor == 'a'] @@ -515,17 +540,20 @@ def f(): def test_argsort(self): c = Categorical([5, 3, 1, 4, 2], ordered=True) - expected = np.array([2, 4, 1, 3, 0], dtype=np.int64) - tm.assert_numpy_array_equal(c.argsort(ascending=True), expected) + expected = np.array([2, 4, 1, 3, 0]) + tm.assert_numpy_array_equal(c.argsort(ascending=True), expected, + check_dtype=False) expected = expected[::-1] - tm.assert_numpy_array_equal(c.argsort(ascending=False), expected) + tm.assert_numpy_array_equal(c.argsort(ascending=False), expected, + check_dtype=False) def test_numpy_argsort(self): c = Categorical([5, 3, 1, 4, 2], ordered=True) - expected = np.array([2, 4, 1, 3, 0], dtype=np.int64) - tm.assert_numpy_array_equal(np.argsort(c), expected) + expected = np.array([2, 4, 1, 3, 0]) + tm.assert_numpy_array_equal(np.argsort(c), expected, + check_dtype=False) msg = "the 'kind' parameter is not supported" tm.assertRaisesRegexp(ValueError, msg, np.argsort, @@ -549,7 +577,7 @@ def test_na_flags_int_categories(self): cat = Categorical(labels, categories, fastpath=True) repr(cat) - self.assert_numpy_array_equal(com.isnull(cat), labels == -1) + self.assert_numpy_array_equal(isnull(cat), labels == -1) def test_categories_none(self): factor = Categorical(['a', 'b', 'b', 'a', @@ -708,7 +736,7 @@ def test_periodindex(self): idx1 = PeriodIndex(['2014-01', '2014-01', '2014-02', '2014-02', '2014-03', '2014-03'], freq='M') - cat1 = Categorical.from_array(idx1) + cat1 = Categorical(idx1) str(cat1) exp_arr = np.array([0, 0, 1, 1, 2, 2], dtype=np.int8) exp_idx = PeriodIndex(['2014-01', '2014-02', '2014-03'], freq='M') @@ -717,7 +745,7 @@ def test_periodindex(self): idx2 = PeriodIndex(['2014-03', '2014-03', '2014-02', '2014-01', '2014-03', '2014-01'], freq='M') - cat2 = Categorical.from_array(idx2, ordered=True) + cat2 = Categorical(idx2, ordered=True) str(cat2) exp_arr = np.array([2, 2, 1, 0, 2, 0], dtype=np.int8) exp_idx2 = PeriodIndex(['2014-01', '2014-02', '2014-03'], freq='M') @@ -726,7 +754,7 @@ def test_periodindex(self): idx3 = PeriodIndex(['2013-12', '2013-11', '2013-10', '2013-09', '2013-08', '2013-07', '2013-05'], freq='M') - cat3 = Categorical.from_array(idx3, ordered=True) + cat3 = Categorical(idx3, ordered=True) exp_arr = np.array([6, 5, 4, 3, 2, 1, 0], dtype=np.int8) exp_idx = PeriodIndex(['2013-05', '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12'], freq='M') @@ -800,13 +828,12 @@ def test_set_ordered(self): cat2.set_ordered(False, inplace=True) self.assertFalse(cat2.ordered) - # deperecated in v0.16.0 - with tm.assert_produces_warning(FutureWarning): - cat.ordered = False - self.assertFalse(cat.ordered) - with tm.assert_produces_warning(FutureWarning): + # removed in 0.19.0 + msg = "can\'t set attribute" + with tm.assertRaisesRegexp(AttributeError, msg): cat.ordered = True - self.assertTrue(cat.ordered) + with tm.assertRaisesRegexp(AttributeError, msg): + cat.ordered = False def test_set_categories(self): cat = Categorical(["a", "b", "c", "a"], ordered=True) @@ -1287,6 +1314,30 @@ def test_unique_ordered(self): ordered=True) tm.assert_categorical_equal(res, exp_cat) + def test_unique_index_series(self): + c = Categorical([3, 1, 2, 2, 1], categories=[3, 2, 1]) + # Categorical.unique sorts 
categories by appearance order + # if ordered=False + exp = Categorical([3, 1, 2], categories=[3, 1, 2]) + tm.assert_categorical_equal(c.unique(), exp) + + tm.assert_index_equal(Index(c).unique(), Index(exp)) + tm.assert_categorical_equal(pd.Series(c).unique(), exp) + + c = Categorical([1, 1, 2, 2], categories=[3, 2, 1]) + exp = Categorical([1, 2], categories=[1, 2]) + tm.assert_categorical_equal(c.unique(), exp) + tm.assert_index_equal(Index(c).unique(), Index(exp)) + tm.assert_categorical_equal(pd.Series(c).unique(), exp) + + c = Categorical([3, 1, 2, 2, 1], categories=[3, 2, 1], ordered=True) + # Categorical.unique keeps categories order if ordered=True + exp = Categorical([3, 1, 2], categories=[3, 2, 1], ordered=True) + tm.assert_categorical_equal(c.unique(), exp) + + tm.assert_index_equal(Index(c).unique(), Index(exp)) + tm.assert_categorical_equal(pd.Series(c).unique(), exp) + def test_mode(self): s = Categorical([1, 1, 2, 4, 5, 5, 5], categories=[5, 4, 3, 2, 1], ordered=True) @@ -1505,7 +1556,7 @@ def test_searchsorted(self): # Single item array res = c1.searchsorted(['bread']) chk = s1.searchsorted(['bread']) - exp = np.array([1], dtype=np.int64) + exp = np.array([1], dtype=np.intp) self.assert_numpy_array_equal(res, exp) self.assert_numpy_array_equal(res, chk) @@ -1514,21 +1565,21 @@ def test_searchsorted(self): # np.array.searchsorted() res = c1.searchsorted('bread') chk = s1.searchsorted('bread') - exp = np.array([1], dtype=np.int64) + exp = np.array([1], dtype=np.intp) self.assert_numpy_array_equal(res, exp) self.assert_numpy_array_equal(res, chk) # Searching for a value that is not present in the Categorical res = c1.searchsorted(['bread', 'eggs']) chk = s1.searchsorted(['bread', 'eggs']) - exp = np.array([1, 4], dtype=np.int64) + exp = np.array([1, 4], dtype=np.intp) self.assert_numpy_array_equal(res, exp) self.assert_numpy_array_equal(res, chk) # Searching for a value that is not present, to the right res = c1.searchsorted(['bread', 'eggs'], side='right') chk = s1.searchsorted(['bread', 'eggs'], side='right') - exp = np.array([3, 4], dtype=np.int64) # eggs before milk + exp = np.array([3, 4], dtype=np.intp) # eggs before milk self.assert_numpy_array_equal(res, exp) self.assert_numpy_array_equal(res, chk) @@ -1538,7 +1589,7 @@ def test_searchsorted(self): chk = s2.searchsorted(['bread', 'eggs'], side='right', sorter=[0, 1, 2, 3, 5, 4]) # eggs after donuts, after switching milk and donuts - exp = np.array([3, 5], dtype=np.int64) + exp = np.array([3, 5], dtype=np.intp) self.assert_numpy_array_equal(res, exp) self.assert_numpy_array_equal(res, chk) @@ -1551,17 +1602,10 @@ def test_deprecated_labels(self): res = cat.labels self.assert_numpy_array_equal(res, exp) - def test_deprecated_levels(self): - # TODO: levels is deprecated and should be removed in 0.18 or 2017, - # whatever is earlier - cat = pd.Categorical([1, 2, 3, np.nan], categories=[1, 2, 3]) - exp = cat.categories + def test_deprecated_from_array(self): + # GH13854, `.from_array` is deprecated with tm.assert_produces_warning(FutureWarning): - res = cat.levels - self.assert_index_equal(res, exp) - with tm.assert_produces_warning(FutureWarning): - res = pd.Categorical([1, 2, 3, np.nan], levels=[1, 2, 3]) - self.assert_index_equal(res.categories, exp) + Categorical.from_array([0, 1]) def test_removed_names_produces_warning(self): @@ -1627,8 +1671,7 @@ class TestCategoricalAsBlock(tm.TestCase): _multiprocess_can_split_ = True def setUp(self): - self.factor = Categorical.from_array(['a', 'b', 'b', 'a', 'a', 'c', - 'c', 
'c']) + self.factor = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']) df = DataFrame({'value': np.random.randint(0, 10000, 100)}) labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] @@ -2061,8 +2104,8 @@ def test_series_functions_no_warnings(self): def test_assignment_to_dataframe(self): # assignment - df = DataFrame({'value': np.array( - np.random.randint(0, 10000, 100), dtype='int32')}) + df = DataFrame({'value': np.array(np.random.randint(0, 10000, 100), + dtype='int32')}) labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] df = df.sort_values(by=['value'], ascending=True) @@ -2073,15 +2116,15 @@ def test_assignment_to_dataframe(self): result = df.dtypes expected = Series( - [np.dtype('int32'), com.CategoricalDtype()], index=['value', 'D']) + [np.dtype('int32'), CategoricalDtype()], index=['value', 'D']) tm.assert_series_equal(result, expected) df['E'] = s str(df) result = df.dtypes - expected = Series([np.dtype('int32'), com.CategoricalDtype(), - com.CategoricalDtype()], + expected = Series([np.dtype('int32'), CategoricalDtype(), + CategoricalDtype()], index=['value', 'D', 'E']) tm.assert_series_equal(result, expected) @@ -2291,28 +2334,28 @@ def test_categorical_repr_period(self): idx = pd.period_range('2011-01-01 09:00', freq='H', periods=5) c = pd.Categorical(idx) exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00] -Categories (5, period): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, - 2011-01-01 13:00]""" +Categories (5, period[H]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, + 2011-01-01 13:00]""" self.assertEqual(repr(c), exp) c = pd.Categorical(idx.append(idx), categories=idx) exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00, 2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00] -Categories (5, period): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, - 2011-01-01 13:00]""" +Categories (5, period[H]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, + 2011-01-01 13:00]""" self.assertEqual(repr(c), exp) idx = pd.period_range('2011-01', freq='M', periods=5) c = pd.Categorical(idx) exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05] -Categories (5, period): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]""" +Categories (5, period[M]): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]""" self.assertEqual(repr(c), exp) c = pd.Categorical(idx.append(idx), categories=idx) exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05, 2011-01, 2011-02, 2011-03, 2011-04, 2011-05] -Categories (5, period): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]""" +Categories (5, period[M]): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]""" self.assertEqual(repr(c), exp) @@ -2320,28 +2363,28 @@ def test_categorical_repr_period_ordered(self): idx = pd.period_range('2011-01-01 09:00', freq='H', periods=5) c = pd.Categorical(idx, ordered=True) exp = """[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00] -Categories (5, period): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 < - 2011-01-01 13:00]""" +Categories (5, period[H]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 < + 2011-01-01 13:00]""" self.assertEqual(repr(c), exp) c = pd.Categorical(idx.append(idx), categories=idx, ordered=True) exp = """[2011-01-01 09:00, 2011-01-01 10:00, 
2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00, 2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00] -Categories (5, period): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 < - 2011-01-01 13:00]""" +Categories (5, period[H]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 < + 2011-01-01 13:00]""" self.assertEqual(repr(c), exp) idx = pd.period_range('2011-01', freq='M', periods=5) c = pd.Categorical(idx, ordered=True) exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05] -Categories (5, period): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]""" +Categories (5, period[M]): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]""" self.assertEqual(repr(c), exp) c = pd.Categorical(idx.append(idx), categories=idx, ordered=True) exp = """[2011-01, 2011-02, 2011-03, 2011-04, 2011-05, 2011-01, 2011-02, 2011-03, 2011-04, 2011-05] -Categories (5, period): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]""" +Categories (5, period[M]): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]""" self.assertEqual(repr(c), exp) @@ -2530,8 +2573,8 @@ def test_categorical_series_repr_period(self): 3 2011-01-01 12:00 4 2011-01-01 13:00 dtype: category -Categories (5, period): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, - 2011-01-01 13:00]""" +Categories (5, period[H]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, + 2011-01-01 13:00]""" self.assertEqual(repr(s), exp) @@ -2543,7 +2586,7 @@ def test_categorical_series_repr_period(self): 3 2011-04 4 2011-05 dtype: category -Categories (5, period): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]""" +Categories (5, period[M]): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]""" self.assertEqual(repr(s), exp) @@ -2556,8 +2599,8 @@ def test_categorical_series_repr_period_ordered(self): 3 2011-01-01 12:00 4 2011-01-01 13:00 dtype: category -Categories (5, period): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 < - 2011-01-01 13:00]""" +Categories (5, period[H]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 < + 2011-01-01 13:00]""" self.assertEqual(repr(s), exp) @@ -2569,7 +2612,7 @@ def test_categorical_series_repr_period_ordered(self): 3 2011-04 4 2011-05 dtype: category -Categories (5, period): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]""" +Categories (5, period[M]): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]""" self.assertEqual(repr(s), exp) @@ -2911,54 +2954,41 @@ def test_value_counts(self): tm.assert_series_equal(res, exp) def test_value_counts_with_nan(self): - # https://github.com/pydata/pandas/issues/9443 + # see gh-9443 + # sanity check s = pd.Series(["a", "b", "a"], dtype="category") - tm.assert_series_equal( - s.value_counts(dropna=True), - pd.Series([2, 1], index=pd.CategoricalIndex(["a", "b"]))) - tm.assert_series_equal( - s.value_counts(dropna=False), - pd.Series([2, 1], index=pd.CategoricalIndex(["a", "b"]))) + exp = pd.Series([2, 1], index=pd.CategoricalIndex(["a", "b"])) - s = pd.Series(["a", "b", None, "a", None, None], dtype="category") - tm.assert_series_equal( - s.value_counts(dropna=True), - pd.Series([2, 1], index=pd.CategoricalIndex(["a", "b"]))) - tm.assert_series_equal( - s.value_counts(dropna=False), - pd.Series([3, 2, 1], index=pd.CategoricalIndex([np.nan, "a", "b"]))) - # When we aren't sorting by counts, and np.nan isn't a - # category, it should be last. 
- tm.assert_series_equal( - s.value_counts(dropna=False, sort=False), - pd.Series([2, 1, 3], - index=pd.CategoricalIndex(["a", "b", np.nan]))) + res = s.value_counts(dropna=True) + tm.assert_series_equal(res, exp) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - s = pd.Series(pd.Categorical(["a", "b", "a"], - categories=["a", "b", np.nan])) + res = s.value_counts(dropna=True) + tm.assert_series_equal(res, exp) - # internal categories are different because of NaN - exp = pd.Series([2, 1], index=pd.CategoricalIndex(["a", "b"])) - tm.assert_series_equal(s.value_counts(dropna=True), exp, - check_categorical=False) - exp = pd.Series([2, 1, 0], - index=pd.CategoricalIndex(["a", "b", np.nan])) - tm.assert_series_equal(s.value_counts(dropna=False), exp, - check_categorical=False) + # same Series via two different constructions --> same behaviour + series = [ + pd.Series(["a", "b", None, "a", None, None], dtype="category"), + pd.Series(pd.Categorical(["a", "b", None, "a", None, None], + categories=["a", "b"])) + ] - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - s = pd.Series(pd.Categorical(["a", "b", None, "a", None, None], - categories=["a", "b", np.nan])) + for s in series: + # None is a NaN value, so we exclude its count here + exp = pd.Series([2, 1], index=pd.CategoricalIndex(["a", "b"])) + res = s.value_counts(dropna=True) + tm.assert_series_equal(res, exp) - exp = pd.Series([2, 1], index=pd.CategoricalIndex(["a", "b"])) - tm.assert_series_equal(s.value_counts(dropna=True), exp, - check_categorical=False) - exp = pd.Series([3, 2, 1], - index=pd.CategoricalIndex([np.nan, "a", "b"])) - tm.assert_series_equal(s.value_counts(dropna=False), exp, - check_categorical=False) + # we don't exclude the count of None and sort by counts + exp = pd.Series([3, 2, 1], index=pd.CategoricalIndex([np.nan, "a", "b"])) + res = s.value_counts(dropna=False) + tm.assert_series_equal(res, exp) + + # When we aren't sorting by counts, and np.nan isn't a + # category, it should be last. 
+ exp = pd.Series([2, 1, 3], index=pd.CategoricalIndex(["a", "b", np.nan])) + res = s.value_counts(dropna=False, sort=False) + tm.assert_series_equal(res, exp) def test_groupby(self): @@ -2987,9 +3017,10 @@ def test_groupby(self): # multiple groupers gb = df.groupby(['A', 'B']) - exp_index = pd.MultiIndex.from_product([['a', 'b', 'z'], - ['c', 'd', 'y']], - names=['A', 'B']) + exp_index = pd.MultiIndex.from_product( + [Categorical(["a", "b", "z"], ordered=True), + Categorical(["c", "d", "y"], ordered=True)], + names=['A', 'B']) expected = DataFrame({'values': [1, 2, np.nan, 3, 4, np.nan, np.nan, np.nan, np.nan]}, index=exp_index) @@ -3000,10 +3031,13 @@ def test_groupby(self): df = df.copy() df['C'] = ['foo', 'bar'] * 2 gb = df.groupby(['A', 'B', 'C']) + exp_index = pd.MultiIndex.from_product( + [Categorical(["a", "b", "z"], ordered=True), + Categorical(["c", "d", "y"], ordered=True), + ['foo', 'bar']], + names=['A', 'B', 'C']) expected = DataFrame({'values': Series( - np.nan, index=pd.MultiIndex.from_product( - [['a', 'b', 'z'], ['c', 'd', 'y'], ['foo', 'bar'] - ], names=['A', 'B', 'C']))}).sortlevel() + np.nan, index=exp_index)}).sortlevel() expected.iloc[[1, 2, 7, 8], 0] = [1, 2, 3, 4] result = gb.sum() tm.assert_frame_equal(result, expected) @@ -3082,11 +3116,12 @@ def test_pivot_table(self): df = DataFrame({"A": raw_cat1, "B": raw_cat2, "values": [1, 2, 3, 4]}) result = pd.pivot_table(df, values='values', index=['A', 'B']) + exp_index = pd.MultiIndex.from_product( + [Categorical(["a", "b", "z"], ordered=True), + Categorical(["c", "d", "y"], ordered=True)], + names=['A', 'B']) expected = Series([1, 2, np.nan, 3, 4, np.nan, np.nan, np.nan, np.nan], - index=pd.MultiIndex.from_product( - [['a', 'b', 'z'], ['c', 'd', 'y']], - names=['A', 'B']), - name='values') + index=exp_index, name='values') tm.assert_series_equal(result, expected) def test_count(self): @@ -3231,7 +3266,7 @@ def test_slicing_and_getting_ops(self): # frame res_df = df.iloc[2:4, :] tm.assert_frame_equal(res_df, exp_df) - self.assertTrue(com.is_categorical_dtype(res_df["cats"])) + self.assertTrue(is_categorical_dtype(res_df["cats"])) # row res_row = df.iloc[2, :] @@ -3241,7 +3276,7 @@ def test_slicing_and_getting_ops(self): # col res_col = df.iloc[:, 0] tm.assert_series_equal(res_col, exp_col) - self.assertTrue(com.is_categorical_dtype(res_col)) + self.assertTrue(is_categorical_dtype(res_col)) # single value res_val = df.iloc[2, 0] @@ -3251,7 +3286,7 @@ def test_slicing_and_getting_ops(self): # frame res_df = df.loc["j":"k", :] tm.assert_frame_equal(res_df, exp_df) - self.assertTrue(com.is_categorical_dtype(res_df["cats"])) + self.assertTrue(is_categorical_dtype(res_df["cats"])) # row res_row = df.loc["j", :] @@ -3261,7 +3296,7 @@ def test_slicing_and_getting_ops(self): # col res_col = df.loc[:, "cats"] tm.assert_series_equal(res_col, exp_col) - self.assertTrue(com.is_categorical_dtype(res_col)) + self.assertTrue(is_categorical_dtype(res_col)) # single value res_val = df.loc["j", "cats"] @@ -3272,7 +3307,7 @@ def test_slicing_and_getting_ops(self): # res_df = df.ix["j":"k",[0,1]] # doesn't work? 
res_df = df.ix["j":"k", :] tm.assert_frame_equal(res_df, exp_df) - self.assertTrue(com.is_categorical_dtype(res_df["cats"])) + self.assertTrue(is_categorical_dtype(res_df["cats"])) # row res_row = df.ix["j", :] @@ -3282,7 +3317,7 @@ def test_slicing_and_getting_ops(self): # col res_col = df.ix[:, "cats"] tm.assert_series_equal(res_col, exp_col) - self.assertTrue(com.is_categorical_dtype(res_col)) + self.assertTrue(is_categorical_dtype(res_col)) # single value res_val = df.ix["j", 0] @@ -3315,37 +3350,36 @@ def test_slicing_and_getting_ops(self): res_df = df.iloc[slice(2, 4)] tm.assert_frame_equal(res_df, exp_df) - self.assertTrue(com.is_categorical_dtype(res_df["cats"])) + self.assertTrue(is_categorical_dtype(res_df["cats"])) res_df = df.iloc[[2, 3]] tm.assert_frame_equal(res_df, exp_df) - self.assertTrue(com.is_categorical_dtype(res_df["cats"])) + self.assertTrue(is_categorical_dtype(res_df["cats"])) res_col = df.iloc[:, 0] tm.assert_series_equal(res_col, exp_col) - self.assertTrue(com.is_categorical_dtype(res_col)) + self.assertTrue(is_categorical_dtype(res_col)) res_df = df.iloc[:, slice(0, 2)] tm.assert_frame_equal(res_df, df) - self.assertTrue(com.is_categorical_dtype(res_df["cats"])) + self.assertTrue(is_categorical_dtype(res_df["cats"])) res_df = df.iloc[:, [0, 1]] tm.assert_frame_equal(res_df, df) - self.assertTrue(com.is_categorical_dtype(res_df["cats"])) + self.assertTrue(is_categorical_dtype(res_df["cats"])) def test_slicing_doc_examples(self): # GH 7918 - cats = Categorical( - ["a", "b", "b", "b", "c", "c", "c"], categories=["a", "b", "c"]) + cats = Categorical(["a", "b", "b", "b", "c", "c", "c"], + categories=["a", "b", "c"]) idx = Index(["h", "i", "j", "k", "l", "m", "n", ]) values = [1, 2, 2, 2, 3, 4, 5] df = DataFrame({"cats": cats, "values": values}, index=idx) result = df.iloc[2:4, :] expected = DataFrame( - {"cats": Categorical( - ['b', 'b'], categories=['a', 'b', 'c']), + {"cats": Categorical(['b', 'b'], categories=['a', 'b', 'c']), "values": [2, 2]}, index=['j', 'k']) tm.assert_frame_equal(result, expected) @@ -3360,10 +3394,9 @@ def test_slicing_doc_examples(self): tm.assert_series_equal(result, expected) result = df.ix["h":"j", 0:1] - expected = DataFrame({'cats': Series( - Categorical( - ['a', 'b', 'b'], categories=['a', 'b', 'c']), index=['h', 'i', - 'j'])}) + expected = DataFrame({'cats': Categorical(['a', 'b', 'b'], + categories=['a', 'b', 'c'])}, + index=['h', 'i', 'j']) tm.assert_frame_equal(result, expected) def test_assigning_ops(self): @@ -3617,8 +3650,8 @@ def f(): with tm.assertRaises(ValueError): # different values df = orig.copy() - df.ix["j":"k", 0] = pd.Categorical( - ["c", "c"], categories=["a", "b", "c"]) + df.ix["j":"k", 0] = pd.Categorical(["c", "c"], + categories=["a", "b", "c"]) # assign a part of a column with dtype != categorical -> # exp_parts_cats_col @@ -3655,8 +3688,8 @@ def f(): self.assertRaises(ValueError, f) # fancy indexing - catsf = pd.Categorical( - ["a", "a", "c", "c", "a", "a", "a"], categories=["a", "b", "c"]) + catsf = pd.Categorical(["a", "a", "c", "c", "a", "a", "a"], + categories=["a", "b", "c"]) idxf = pd.Index(["h", "i", "j", "k", "l", "m", "n"]) valuesf = [1, 1, 3, 3, 1, 1, 1] df = pd.DataFrame({"cats": catsf, "values": valuesf}, index=idxf) @@ -3714,9 +3747,8 @@ def f(): s = orig.copy() s.index = ["x", "y"] s["y"] = "a" - exp = Series( - pd.Categorical(["b", "a"], - categories=["a", "b"]), index=["x", "y"]) + exp = Series(pd.Categorical(["b", "a"], categories=["a", "b"]), + index=["x", "y"]) tm.assert_series_equal(s, exp) 
# ensure that one can set something to np.nan @@ -3868,7 +3900,7 @@ def test_cat_equality(self): self.assertRaises(TypeError, lambda: a > b) self.assertRaises(TypeError, lambda: b > a) - def test_concat(self): + def test_concat_append(self): cat = pd.Categorical(["a", "b"], categories=["a", "b"]) vals = [1, 2] df = pd.DataFrame({"cats": cat, "vals": vals}) @@ -3877,20 +3909,22 @@ def test_concat(self): exp = pd.DataFrame({"cats": cat2, "vals": vals2}, index=pd.Index([0, 1, 0, 1])) - res = pd.concat([df, df]) - tm.assert_frame_equal(exp, res) + tm.assert_frame_equal(pd.concat([df, df]), exp) + tm.assert_frame_equal(df.append(df), exp) - # Concat should raise if the two categoricals do not have the same - # categories + # GH 13524 can concat different categories cat3 = pd.Categorical(["a", "b"], categories=["a", "b", "c"]) vals3 = [1, 2] - df_wrong_categories = pd.DataFrame({"cats": cat3, "vals": vals3}) + df_different_categories = pd.DataFrame({"cats": cat3, "vals": vals3}) - def f(): - pd.concat([df, df_wrong_categories]) + res = pd.concat([df, df_different_categories], ignore_index=True) + exp = pd.DataFrame({"cats": list('abab'), "vals": [1, 2, 1, 2]}) + tm.assert_frame_equal(res, exp) - self.assertRaises(ValueError, f) + res = df.append(df_different_categories, ignore_index=True) + tm.assert_frame_equal(res, exp) + def test_concat_append_gh7864(self): # GH 7864 # make sure ordering is preserverd df = pd.DataFrame({"id": [1, 2, 3, 4, 5, 6], @@ -3907,41 +3941,44 @@ def f(): df2['grade'].cat.categories) dfx = pd.concat([df1, df2]) - dfx['grade'].cat.categories self.assert_index_equal(df['grade'].cat.categories, dfx['grade'].cat.categories) + dfa = df1.append(df2) + self.assert_index_equal(df['grade'].cat.categories, + dfa['grade'].cat.categories) + + def test_concat_preserve(self): - # GH 8641 - # series concat not preserving category dtype + # GH 8641 series concat not preserving category dtype + # GH 13524 can concat different categories s = Series(list('abc'), dtype='category') s2 = Series(list('abd'), dtype='category') - def f(): - pd.concat([s, s2]) - - self.assertRaises(ValueError, f) + exp = Series(list('abcabd')) + res = pd.concat([s, s2], ignore_index=True) + tm.assert_series_equal(res, exp) - result = pd.concat([s, s], ignore_index=True) - expected = Series(list('abcabc')).astype('category') - tm.assert_series_equal(result, expected) + exp = Series(list('abcabc'), dtype='category') + res = pd.concat([s, s], ignore_index=True) + tm.assert_series_equal(res, exp) - result = pd.concat([s, s]) - expected = Series( - list('abcabc'), index=[0, 1, 2, 0, 1, 2]).astype('category') - tm.assert_series_equal(result, expected) + exp = Series(list('abcabc'), index=[0, 1, 2, 0, 1, 2], + dtype='category') + res = pd.concat([s, s]) + tm.assert_series_equal(res, exp) a = Series(np.arange(6, dtype='int64')) b = Series(list('aabbca')) df2 = DataFrame({'A': a, 'B': b.astype('category', categories=list('cab'))}) - result = pd.concat([df2, df2]) - expected = DataFrame({'A': pd.concat([a, a]), - 'B': pd.concat([b, b]).astype( - 'category', categories=list('cab'))}) - tm.assert_frame_equal(result, expected) + res = pd.concat([df2, df2]) + exp = DataFrame({'A': pd.concat([a, a]), + 'B': pd.concat([b, b]).astype( + 'category', categories=list('cab'))}) + tm.assert_frame_equal(res, exp) def test_categorical_index_preserver(self): @@ -3949,44 +3986,21 @@ def test_categorical_index_preserver(self): b = Series(list('aabbca')) df2 = DataFrame({'A': a, - 'B': b.astype('category', categories=list( - 
'cab'))}).set_index('B') + 'B': b.astype('category', categories=list('cab')) + }).set_index('B') result = pd.concat([df2, df2]) expected = DataFrame({'A': pd.concat([a, a]), 'B': pd.concat([b, b]).astype( - 'category', categories=list( - 'cab'))}).set_index('B') + 'category', categories=list('cab')) + }).set_index('B') tm.assert_frame_equal(result, expected) # wrong catgories df3 = DataFrame({'A': a, - 'B': b.astype('category', categories=list( - 'abc'))}).set_index('B') + 'B': pd.Categorical(b, categories=list('abc')) + }).set_index('B') self.assertRaises(TypeError, lambda: pd.concat([df2, df3])) - def test_append(self): - cat = pd.Categorical(["a", "b"], categories=["a", "b"]) - vals = [1, 2] - df = pd.DataFrame({"cats": cat, "vals": vals}) - cat2 = pd.Categorical(["a", "b", "a", "b"], categories=["a", "b"]) - vals2 = [1, 2, 1, 2] - exp = pd.DataFrame({"cats": cat2, - "vals": vals2}, index=pd.Index([0, 1, 0, 1])) - - res = df.append(df) - tm.assert_frame_equal(exp, res) - - # Concat should raise if the two categoricals do not have the same - # categories - cat3 = pd.Categorical(["a", "b"], categories=["a", "b", "c"]) - vals3 = [1, 2] - df_wrong_categories = pd.DataFrame({"cats": cat3, "vals": vals3}) - - def f(): - df.append(df_wrong_categories) - - self.assertRaises(ValueError, f) - def test_merge(self): # GH 9426 @@ -4050,13 +4064,40 @@ def test_numpy_repeat(self): msg = "the 'axis' parameter is not supported" tm.assertRaisesRegexp(ValueError, msg, np.repeat, cat, 2, axis=1) + def test_reshape(self): + cat = pd.Categorical([], categories=["a", "b"]) + tm.assert_produces_warning(FutureWarning, cat.reshape, 0) + + with tm.assert_produces_warning(FutureWarning): + cat = pd.Categorical([], categories=["a", "b"]) + self.assert_categorical_equal(cat.reshape(0), cat) + + with tm.assert_produces_warning(FutureWarning): + cat = pd.Categorical([], categories=["a", "b"]) + self.assert_categorical_equal(cat.reshape((5, -1)), cat) + + with tm.assert_produces_warning(FutureWarning): + cat = pd.Categorical(["a", "b"], categories=["a", "b"]) + self.assert_categorical_equal(cat.reshape(cat.shape), cat) + + with tm.assert_produces_warning(FutureWarning): + cat = pd.Categorical(["a", "b"], categories=["a", "b"]) + self.assert_categorical_equal(cat.reshape(cat.size), cat) + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + msg = "can only specify one unknown dimension" + cat = pd.Categorical(["a", "b"], categories=["a", "b"]) + tm.assertRaisesRegexp(ValueError, msg, cat.reshape, (-2, -1)) + def test_numpy_reshape(self): - cat = pd.Categorical(["a", "b"], categories=["a", "b"]) - self.assert_categorical_equal(np.reshape(cat, cat.shape), cat) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + cat = pd.Categorical(["a", "b"], categories=["a", "b"]) + self.assert_categorical_equal(np.reshape(cat, cat.shape), cat) - msg = "the 'order' parameter is not supported" - tm.assertRaisesRegexp(ValueError, msg, np.reshape, - cat, cat.shape, order='F') + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + msg = "the 'order' parameter is not supported" + tm.assertRaisesRegexp(ValueError, msg, np.reshape, + cat, cat.shape, order='F') def test_na_actions(self): @@ -4091,33 +4132,59 @@ def f(): res = df.dropna() tm.assert_frame_equal(res, df_exp_drop_all) - # make sure that fillna takes both missing values and NA categories - # into account - c = Categorical(["a", "b", np.nan]) - with tm.assert_produces_warning(FutureWarning): - c.set_categories(["a", 
"b", np.nan], rename=True, inplace=True) - - c[0] = np.nan + # make sure that fillna takes missing values into account + c = Categorical([np.nan, "b", np.nan], categories=["a", "b"]) df = pd.DataFrame({"cats": c, "vals": [1, 2, 3]}) - cat_exp = Categorical(["a", "b", "a"], categories=["a", "b", np.nan]) + cat_exp = Categorical(["a", "b", "a"], categories=["a", "b"]) df_exp = pd.DataFrame({"cats": cat_exp, "vals": [1, 2, 3]}) res = df.fillna("a") tm.assert_frame_equal(res, df_exp) + # GH 14021 + # np.nan should always be a is a valid filler + cat = Categorical([np.nan, 2, np.nan]) + val = Categorical([np.nan, np.nan, np.nan]) + df = DataFrame({"cats": cat, "vals": val}) + res = df.fillna(df.median()) + v_exp = [np.nan, np.nan, np.nan] + df_exp = pd.DataFrame({"cats": [2, 2, 2], "vals": v_exp}, + dtype='category') + tm.assert_frame_equal(res, df_exp) + + result = df.cats.fillna(np.nan) + tm.assert_series_equal(result, df.cats) + result = df.vals.fillna(np.nan) + tm.assert_series_equal(result, df.vals) + + idx = pd.DatetimeIndex(['2011-01-01 09:00', '2016-01-01 23:45', + '2011-01-01 09:00', pd.NaT, pd.NaT]) + df = DataFrame({'a': pd.Categorical(idx)}) + tm.assert_frame_equal(df.fillna(value=pd.NaT), df) + + idx = pd.PeriodIndex(['2011-01', '2011-01', '2011-01', + pd.NaT, pd.NaT], freq='M') + df = DataFrame({'a': pd.Categorical(idx)}) + tm.assert_frame_equal(df.fillna(value=pd.NaT), df) + + idx = pd.TimedeltaIndex(['1 days', '2 days', + '1 days', pd.NaT, pd.NaT]) + df = pd.DataFrame({'a': pd.Categorical(idx)}) + tm.assert_frame_equal(df.fillna(value=pd.NaT), df) + def test_astype_to_other(self): s = self.cat['value_group'] expected = s tm.assert_series_equal(s.astype('category'), expected) - tm.assert_series_equal(s.astype(com.CategoricalDtype()), expected) + tm.assert_series_equal(s.astype(CategoricalDtype()), expected) self.assertRaises(ValueError, lambda: s.astype('float64')) cat = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])) exp = Series(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']) tm.assert_series_equal(cat.astype('str'), exp) - s2 = Series(Categorical.from_array(['1', '2', '3', '4'])) + s2 = Series(Categorical(['1', '2', '3', '4'])) exp2 = Series([1, 2, 3, 4]).astype(int) tm.assert_series_equal(s2.astype('int'), exp2) @@ -4136,10 +4203,10 @@ def cmp(a, b): # valid conversion for valid in [lambda x: x.astype('category'), - lambda x: x.astype(com.CategoricalDtype()), + lambda x: x.astype(CategoricalDtype()), lambda x: x.astype('object').astype('category'), lambda x: x.astype('object').astype( - com.CategoricalDtype()) + CategoricalDtype()) ]: result = valid(s) @@ -4396,67 +4463,54 @@ def test_dt_accessor_api_for_categorical(self): invalid.dt self.assertFalse(hasattr(invalid, 'str')) - def test_pickle_v0_14_1(self): + def test_concat_categorical(self): + # See GH 10177 + df1 = pd.DataFrame(np.arange(18, dtype='int64').reshape(6, 3), + columns=["a", "b", "c"]) - # we have the name warning - # 10482 - with tm.assert_produces_warning(UserWarning): - cat = pd.Categorical(values=['a', 'b', 'c'], - categories=['a', 'b', 'c', 'd'], - name='foobar', ordered=False) - pickle_path = os.path.join(tm.get_data_path(), - 'categorical_0_14_1.pickle') - # This code was executed once on v0.14.1 to generate the pickle: - # - # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'], - # name='foobar') - # with open(pickle_path, 'wb') as f: pickle.dump(cat, f) - # - self.assert_categorical_equal(cat, pd.read_pickle(pickle_path)) - - def test_pickle_v0_15_2(self): - # ordered -> 
_ordered - # GH 9347 + df2 = pd.DataFrame(np.arange(14, dtype='int64').reshape(7, 2), + columns=["a", "c"]) - # we have the name warning - # 10482 - with tm.assert_produces_warning(UserWarning): - cat = pd.Categorical(values=['a', 'b', 'c'], - categories=['a', 'b', 'c', 'd'], - name='foobar', ordered=False) - pickle_path = os.path.join(tm.get_data_path(), - 'categorical_0_15_2.pickle') - # This code was executed once on v0.15.2 to generate the pickle: - # - # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'], - # name='foobar') - # with open(pickle_path, 'wb') as f: pickle.dump(cat, f) - # - self.assert_categorical_equal(cat, pd.read_pickle(pickle_path)) + cat_values = ["one", "one", "two", "one", "two", "two", "one"] + df2['h'] = pd.Series(pd.Categorical(cat_values)) - def test_concat_categorical(self): - # See GH 10177 - df1 = pd.DataFrame( - np.arange(18, dtype='int64').reshape(6, - 3), columns=["a", "b", "c"]) - - df2 = pd.DataFrame( - np.arange(14, dtype='int64').reshape(7, 2), columns=["a", "c"]) - df2['h'] = pd.Series(pd.Categorical(["one", "one", "two", "one", "two", - "two", "one"])) - - df_concat = pd.concat((df1, df2), axis=0).reset_index(drop=True) - - df_expected = pd.DataFrame( - {'a': [0, 3, 6, 9, 12, 15, 0, 2, 4, 6, 8, 10, 12], - 'b': [1, 4, 7, 10, 13, 16, np.nan, np.nan, np.nan, np.nan, np.nan, - np.nan, np.nan], - 'c': [2, 5, 8, 11, 14, 17, 1, 3, 5, 7, 9, 11, 13]}) - df_expected['h'] = pd.Series(pd.Categorical( - [None, None, None, None, None, None, "one", "one", "two", "one", - "two", "two", "one"])) - - tm.assert_frame_equal(df_expected, df_concat) + res = pd.concat((df1, df2), axis=0, ignore_index=True) + exp = pd.DataFrame({'a': [0, 3, 6, 9, 12, 15, 0, 2, 4, 6, 8, 10, 12], + 'b': [1, 4, 7, 10, 13, 16, np.nan, np.nan, + np.nan, np.nan, np.nan, np.nan, np.nan], + 'c': [2, 5, 8, 11, 14, 17, 1, 3, 5, 7, 9, 11, 13], + 'h': [None] * 6 + cat_values}) + tm.assert_frame_equal(res, exp) + + +class TestCategoricalSubclassing(tm.TestCase): + + _multiprocess_can_split_ = True + + def test_constructor(self): + sc = tm.SubclassedCategorical(['a', 'b', 'c']) + self.assertIsInstance(sc, tm.SubclassedCategorical) + tm.assert_categorical_equal(sc, Categorical(['a', 'b', 'c'])) + + def test_from_array(self): + sc = tm.SubclassedCategorical.from_codes([1, 0, 2], ['a', 'b', 'c']) + self.assertIsInstance(sc, tm.SubclassedCategorical) + exp = Categorical.from_codes([1, 0, 2], ['a', 'b', 'c']) + tm.assert_categorical_equal(sc, exp) + + def test_map(self): + sc = tm.SubclassedCategorical(['a', 'b', 'c']) + res = sc.map(lambda x: x.upper()) + self.assertIsInstance(res, tm.SubclassedCategorical) + exp = Categorical(['A', 'B', 'C']) + tm.assert_categorical_equal(res, exp) + + def test_map(self): + sc = tm.SubclassedCategorical(['a', 'b', 'c']) + res = sc.map(lambda x: x.upper()) + self.assertIsInstance(res, tm.SubclassedCategorical) + exp = Categorical(['A', 'B', 'C']) + tm.assert_categorical_equal(res, exp) if __name__ == '__main__': diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index ad43dc1c09ef1..09dd3f7ab517c 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -1,21 +1,12 @@ # -*- coding: utf-8 -*- -import collections -from datetime import datetime, timedelta -import re import nose import numpy as np -import pandas as pd -from pandas.tslib import iNaT, NaT -from pandas import (Series, DataFrame, date_range, DatetimeIndex, - TimedeltaIndex, Timestamp, Float64Index) -from pandas import compat -from pandas.compat import range, 
lrange, lmap, u -from pandas.core.common import notnull, isnull, array_equivalent + +from pandas import Series, Timestamp +from pandas.compat import range, lmap import pandas.core.common as com -import pandas.core.convert as convert import pandas.util.testing as tm -import pandas.core.config as cf _multiprocess_can_split_ = True @@ -28,22 +19,6 @@ def test_mut_exclusive(): assert com._mut_exclusive(major=None, major_axis=None) is None -def test_is_sequence(): - is_seq = com.is_sequence - assert (is_seq((1, 2))) - assert (is_seq([1, 2])) - assert (not is_seq("abcd")) - assert (not is_seq(u("abcd"))) - assert (not is_seq(np.int64)) - - class A(object): - - def __getitem__(self): - return 1 - - assert (not is_seq(A())) - - def test_get_callable_name(): from functools import partial getname = com._get_callable_name @@ -68,407 +43,6 @@ def __call__(self): assert getname(1) is None -class TestInferDtype(tm.TestCase): - - def test_infer_dtype_from_scalar(self): - # Test that _infer_dtype_from_scalar is returning correct dtype for int - # and float. - - for dtypec in [np.uint8, np.int8, np.uint16, np.int16, np.uint32, - np.int32, np.uint64, np.int64]: - data = dtypec(12) - dtype, val = com._infer_dtype_from_scalar(data) - self.assertEqual(dtype, type(data)) - - data = 12 - dtype, val = com._infer_dtype_from_scalar(data) - self.assertEqual(dtype, np.int64) - - for dtypec in [np.float16, np.float32, np.float64]: - data = dtypec(12) - dtype, val = com._infer_dtype_from_scalar(data) - self.assertEqual(dtype, dtypec) - - data = np.float(12) - dtype, val = com._infer_dtype_from_scalar(data) - self.assertEqual(dtype, np.float64) - - for data in [True, False]: - dtype, val = com._infer_dtype_from_scalar(data) - self.assertEqual(dtype, np.bool_) - - for data in [np.complex64(1), np.complex128(1)]: - dtype, val = com._infer_dtype_from_scalar(data) - self.assertEqual(dtype, np.complex_) - - import datetime - for data in [np.datetime64(1, 'ns'), pd.Timestamp(1), - datetime.datetime(2000, 1, 1, 0, 0)]: - dtype, val = com._infer_dtype_from_scalar(data) - self.assertEqual(dtype, 'M8[ns]') - - for data in [np.timedelta64(1, 'ns'), pd.Timedelta(1), - datetime.timedelta(1)]: - dtype, val = com._infer_dtype_from_scalar(data) - self.assertEqual(dtype, 'm8[ns]') - - for data in [datetime.date(2000, 1, 1), - pd.Timestamp(1, tz='US/Eastern'), 'foo']: - dtype, val = com._infer_dtype_from_scalar(data) - self.assertEqual(dtype, np.object_) - - -def test_notnull(): - assert notnull(1.) - assert not notnull(None) - assert not notnull(np.NaN) - - with cf.option_context("mode.use_inf_as_null", False): - assert notnull(np.inf) - assert notnull(-np.inf) - - arr = np.array([1.5, np.inf, 3.5, -np.inf]) - result = notnull(arr) - assert result.all() - - with cf.option_context("mode.use_inf_as_null", True): - assert not notnull(np.inf) - assert not notnull(-np.inf) - - arr = np.array([1.5, np.inf, 3.5, -np.inf]) - result = notnull(arr) - assert result.sum() == 2 - - with cf.option_context("mode.use_inf_as_null", False): - for s in [tm.makeFloatSeries(), tm.makeStringSeries(), - tm.makeObjectSeries(), tm.makeTimeSeries(), - tm.makePeriodSeries()]: - assert (isinstance(isnull(s), Series)) - - -def test_isnull(): - assert not isnull(1.) 
- assert isnull(None) - assert isnull(np.NaN) - assert not isnull(np.inf) - assert not isnull(-np.inf) - - # series - for s in [tm.makeFloatSeries(), tm.makeStringSeries(), - tm.makeObjectSeries(), tm.makeTimeSeries(), - tm.makePeriodSeries()]: - assert (isinstance(isnull(s), Series)) - - # frame - for df in [tm.makeTimeDataFrame(), tm.makePeriodFrame(), - tm.makeMixedDataFrame()]: - result = isnull(df) - expected = df.apply(isnull) - tm.assert_frame_equal(result, expected) - - # panel - for p in [tm.makePanel(), tm.makePeriodPanel(), tm.add_nans(tm.makePanel()) - ]: - result = isnull(p) - expected = p.apply(isnull) - tm.assert_panel_equal(result, expected) - - # panel 4d - for p in [tm.makePanel4D(), tm.add_nans_panel4d(tm.makePanel4D())]: - result = isnull(p) - expected = p.apply(isnull) - tm.assert_panel4d_equal(result, expected) - - -def test_isnull_lists(): - result = isnull([[False]]) - exp = np.array([[False]]) - assert (np.array_equal(result, exp)) - - result = isnull([[1], [2]]) - exp = np.array([[False], [False]]) - assert (np.array_equal(result, exp)) - - # list of strings / unicode - result = isnull(['foo', 'bar']) - assert (not result.any()) - - result = isnull([u('foo'), u('bar')]) - assert (not result.any()) - - -def test_isnull_nat(): - result = isnull([NaT]) - exp = np.array([True]) - assert (np.array_equal(result, exp)) - - result = isnull(np.array([NaT], dtype=object)) - exp = np.array([True]) - assert (np.array_equal(result, exp)) - - -def test_isnull_numpy_nat(): - arr = np.array([NaT, np.datetime64('NaT'), np.timedelta64('NaT'), - np.datetime64('NaT', 's')]) - result = isnull(arr) - expected = np.array([True] * 4) - tm.assert_numpy_array_equal(result, expected) - - -def test_isnull_datetime(): - assert (not isnull(datetime.now())) - assert notnull(datetime.now()) - - idx = date_range('1/1/1990', periods=20) - assert (notnull(idx).all()) - - idx = np.asarray(idx) - idx[0] = iNaT - idx = DatetimeIndex(idx) - mask = isnull(idx) - assert (mask[0]) - assert (not mask[1:].any()) - - # GH 9129 - pidx = idx.to_period(freq='M') - mask = isnull(pidx) - assert (mask[0]) - assert (not mask[1:].any()) - - mask = isnull(pidx[1:]) - assert (not mask.any()) - - -class TestIsNull(tm.TestCase): - - def test_0d_array(self): - self.assertTrue(isnull(np.array(np.nan))) - self.assertFalse(isnull(np.array(0.0))) - self.assertFalse(isnull(np.array(0))) - # test object dtype - self.assertTrue(isnull(np.array(np.nan, dtype=object))) - self.assertFalse(isnull(np.array(0.0, dtype=object))) - self.assertFalse(isnull(np.array(0, dtype=object))) - - -class TestNumberScalar(tm.TestCase): - - def test_is_number(self): - - self.assertTrue(com.is_number(True)) - self.assertTrue(com.is_number(1)) - self.assertTrue(com.is_number(1.1)) - self.assertTrue(com.is_number(1 + 3j)) - self.assertTrue(com.is_number(np.bool(False))) - self.assertTrue(com.is_number(np.int64(1))) - self.assertTrue(com.is_number(np.float64(1.1))) - self.assertTrue(com.is_number(np.complex128(1 + 3j))) - self.assertTrue(com.is_number(np.nan)) - - self.assertFalse(com.is_number(None)) - self.assertFalse(com.is_number('x')) - self.assertFalse(com.is_number(datetime(2011, 1, 1))) - self.assertFalse(com.is_number(np.datetime64('2011-01-01'))) - self.assertFalse(com.is_number(pd.Timestamp('2011-01-01'))) - self.assertFalse(com.is_number(pd.Timestamp('2011-01-01', - tz='US/Eastern'))) - self.assertFalse(com.is_number(timedelta(1000))) - self.assertFalse(com.is_number(pd.Timedelta('1 days'))) - - # questionable - 
self.assertFalse(com.is_number(np.bool_(False))) - self.assertTrue(com.is_number(np.timedelta64(1, 'D'))) - - def test_is_bool(self): - self.assertTrue(com.is_bool(True)) - self.assertTrue(com.is_bool(np.bool(False))) - self.assertTrue(com.is_bool(np.bool_(False))) - - self.assertFalse(com.is_bool(1)) - self.assertFalse(com.is_bool(1.1)) - self.assertFalse(com.is_bool(1 + 3j)) - self.assertFalse(com.is_bool(np.int64(1))) - self.assertFalse(com.is_bool(np.float64(1.1))) - self.assertFalse(com.is_bool(np.complex128(1 + 3j))) - self.assertFalse(com.is_bool(np.nan)) - self.assertFalse(com.is_bool(None)) - self.assertFalse(com.is_bool('x')) - self.assertFalse(com.is_bool(datetime(2011, 1, 1))) - self.assertFalse(com.is_bool(np.datetime64('2011-01-01'))) - self.assertFalse(com.is_bool(pd.Timestamp('2011-01-01'))) - self.assertFalse(com.is_bool(pd.Timestamp('2011-01-01', - tz='US/Eastern'))) - self.assertFalse(com.is_bool(timedelta(1000))) - self.assertFalse(com.is_bool(np.timedelta64(1, 'D'))) - self.assertFalse(com.is_bool(pd.Timedelta('1 days'))) - - def test_is_integer(self): - self.assertTrue(com.is_integer(1)) - self.assertTrue(com.is_integer(np.int64(1))) - - self.assertFalse(com.is_integer(True)) - self.assertFalse(com.is_integer(1.1)) - self.assertFalse(com.is_integer(1 + 3j)) - self.assertFalse(com.is_integer(np.bool(False))) - self.assertFalse(com.is_integer(np.bool_(False))) - self.assertFalse(com.is_integer(np.float64(1.1))) - self.assertFalse(com.is_integer(np.complex128(1 + 3j))) - self.assertFalse(com.is_integer(np.nan)) - self.assertFalse(com.is_integer(None)) - self.assertFalse(com.is_integer('x')) - self.assertFalse(com.is_integer(datetime(2011, 1, 1))) - self.assertFalse(com.is_integer(np.datetime64('2011-01-01'))) - self.assertFalse(com.is_integer(pd.Timestamp('2011-01-01'))) - self.assertFalse(com.is_integer(pd.Timestamp('2011-01-01', - tz='US/Eastern'))) - self.assertFalse(com.is_integer(timedelta(1000))) - self.assertFalse(com.is_integer(pd.Timedelta('1 days'))) - - # questionable - self.assertTrue(com.is_integer(np.timedelta64(1, 'D'))) - - def test_is_float(self): - self.assertTrue(com.is_float(1.1)) - self.assertTrue(com.is_float(np.float64(1.1))) - self.assertTrue(com.is_float(np.nan)) - - self.assertFalse(com.is_float(True)) - self.assertFalse(com.is_float(1)) - self.assertFalse(com.is_float(1 + 3j)) - self.assertFalse(com.is_float(np.bool(False))) - self.assertFalse(com.is_float(np.bool_(False))) - self.assertFalse(com.is_float(np.int64(1))) - self.assertFalse(com.is_float(np.complex128(1 + 3j))) - self.assertFalse(com.is_float(None)) - self.assertFalse(com.is_float('x')) - self.assertFalse(com.is_float(datetime(2011, 1, 1))) - self.assertFalse(com.is_float(np.datetime64('2011-01-01'))) - self.assertFalse(com.is_float(pd.Timestamp('2011-01-01'))) - self.assertFalse(com.is_float(pd.Timestamp('2011-01-01', - tz='US/Eastern'))) - self.assertFalse(com.is_float(timedelta(1000))) - self.assertFalse(com.is_float(np.timedelta64(1, 'D'))) - self.assertFalse(com.is_float(pd.Timedelta('1 days'))) - - -def test_downcast_conv(): - # test downcasting - - arr = np.array([8.5, 8.6, 8.7, 8.8, 8.9999999999995]) - result = com._possibly_downcast_to_dtype(arr, 'infer') - assert (np.array_equal(result, arr)) - - arr = np.array([8., 8., 8., 8., 8.9999999999995]) - result = com._possibly_downcast_to_dtype(arr, 'infer') - expected = np.array([8, 8, 8, 8, 9]) - assert (np.array_equal(result, expected)) - - arr = np.array([8., 8., 8., 8., 9.0000000000005]) - result = 
com._possibly_downcast_to_dtype(arr, 'infer') - expected = np.array([8, 8, 8, 8, 9]) - assert (np.array_equal(result, expected)) - - # conversions - - expected = np.array([1, 2]) - for dtype in [np.float64, object, np.int64]: - arr = np.array([1.0, 2.0], dtype=dtype) - result = com._possibly_downcast_to_dtype(arr, 'infer') - tm.assert_almost_equal(result, expected, check_dtype=False) - - for dtype in [np.float64, object]: - expected = np.array([1.0, 2.0, np.nan], dtype=dtype) - arr = np.array([1.0, 2.0, np.nan], dtype=dtype) - result = com._possibly_downcast_to_dtype(arr, 'infer') - tm.assert_almost_equal(result, expected) - - # empties - for dtype in [np.int32, np.float64, np.float32, np.bool_, - np.int64, object]: - arr = np.array([], dtype=dtype) - result = com._possibly_downcast_to_dtype(arr, 'int64') - tm.assert_almost_equal(result, np.array([], dtype=np.int64)) - assert result.dtype == np.int64 - - -def test_array_equivalent(): - assert array_equivalent(np.array([np.nan, np.nan]), - np.array([np.nan, np.nan])) - assert array_equivalent(np.array([np.nan, 1, np.nan]), - np.array([np.nan, 1, np.nan])) - assert array_equivalent(np.array([np.nan, None], dtype='object'), - np.array([np.nan, None], dtype='object')) - assert array_equivalent(np.array([np.nan, 1 + 1j], dtype='complex'), - np.array([np.nan, 1 + 1j], dtype='complex')) - assert not array_equivalent( - np.array([np.nan, 1 + 1j], dtype='complex'), np.array( - [np.nan, 1 + 2j], dtype='complex')) - assert not array_equivalent( - np.array([np.nan, 1, np.nan]), np.array([np.nan, 2, np.nan])) - assert not array_equivalent( - np.array(['a', 'b', 'c', 'd']), np.array(['e', 'e'])) - assert array_equivalent(Float64Index([0, np.nan]), - Float64Index([0, np.nan])) - assert not array_equivalent( - Float64Index([0, np.nan]), Float64Index([1, np.nan])) - assert array_equivalent(DatetimeIndex([0, np.nan]), - DatetimeIndex([0, np.nan])) - assert not array_equivalent( - DatetimeIndex([0, np.nan]), DatetimeIndex([1, np.nan])) - assert array_equivalent(TimedeltaIndex([0, np.nan]), - TimedeltaIndex([0, np.nan])) - assert not array_equivalent( - TimedeltaIndex([0, np.nan]), TimedeltaIndex([1, np.nan])) - assert array_equivalent(DatetimeIndex([0, np.nan], tz='US/Eastern'), - DatetimeIndex([0, np.nan], tz='US/Eastern')) - assert not array_equivalent( - DatetimeIndex([0, np.nan], tz='US/Eastern'), DatetimeIndex( - [1, np.nan], tz='US/Eastern')) - assert not array_equivalent( - DatetimeIndex([0, np.nan]), DatetimeIndex( - [0, np.nan], tz='US/Eastern')) - assert not array_equivalent( - DatetimeIndex([0, np.nan], tz='CET'), DatetimeIndex( - [0, np.nan], tz='US/Eastern')) - assert not array_equivalent( - DatetimeIndex([0, np.nan]), TimedeltaIndex([0, np.nan])) - - -def test_array_equivalent_str(): - for dtype in ['O', 'S', 'U']: - assert array_equivalent(np.array(['A', 'B'], dtype=dtype), - np.array(['A', 'B'], dtype=dtype)) - assert not array_equivalent(np.array(['A', 'B'], dtype=dtype), - np.array(['A', 'X'], dtype=dtype)) - - -def test_datetimeindex_from_empty_datetime64_array(): - for unit in ['ms', 'us', 'ns']: - idx = DatetimeIndex(np.array([], dtype='datetime64[%s]' % unit)) - assert (len(idx) == 0) - - -def test_nan_to_nat_conversions(): - - df = DataFrame(dict({ - 'A': np.asarray( - lrange(10), dtype='float64'), - 'B': Timestamp('20010101') - })) - df.iloc[3:6, :] = np.nan - result = df.loc[4, 'B'].value - assert (result == iNaT) - - s = df['B'].copy() - s._data = s._data.setitem(indexer=tuple([slice(8, 9)]), value=np.nan) - assert (isnull(s[8])) - 
- # numpy < 1.7.0 is wrong - from distutils.version import LooseVersion - if LooseVersion(np.__version__) >= '1.7.0': - assert (s[8].value == np.datetime64('NaT').astype(np.int64)) - - def test_any_none(): assert (com._any_none(1, 2, 3, None)) assert (not com._any_none(1, 2, 3, 4)) @@ -567,122 +141,6 @@ def test_groupby(): assert v == expected[k] -def test_is_list_like(): - passes = ([], [1], (1, ), (1, 2), {'a': 1}, set([1, 'a']), Series([1]), - Series([]), Series(['a']).str) - fails = (1, '2', object()) - - for p in passes: - assert com.is_list_like(p) - - for f in fails: - assert not com.is_list_like(f) - - -def test_is_dict_like(): - passes = [{}, {'A': 1}, pd.Series([1])] - fails = ['1', 1, [1, 2], (1, 2), range(2), pd.Index([1])] - - for p in passes: - assert com.is_dict_like(p) - - for f in fails: - assert not com.is_dict_like(f) - - -def test_is_named_tuple(): - passes = (collections.namedtuple('Test', list('abc'))(1, 2, 3), ) - fails = ((1, 2, 3), 'a', Series({'pi': 3.14})) - - for p in passes: - assert com.is_named_tuple(p) - - for f in fails: - assert not com.is_named_tuple(f) - - -def test_is_hashable(): - - # all new-style classes are hashable by default - class HashableClass(object): - pass - - class UnhashableClass1(object): - __hash__ = None - - class UnhashableClass2(object): - - def __hash__(self): - raise TypeError("Not hashable") - - hashable = (1, - 3.14, - np.float64(3.14), - 'a', - tuple(), - (1, ), - HashableClass(), ) - not_hashable = ([], UnhashableClass1(), ) - abc_hashable_not_really_hashable = (([], ), UnhashableClass2(), ) - - for i in hashable: - assert com.is_hashable(i) - for i in not_hashable: - assert not com.is_hashable(i) - for i in abc_hashable_not_really_hashable: - assert not com.is_hashable(i) - - # numpy.array is no longer collections.Hashable as of - # https://github.com/numpy/numpy/pull/5326, just test - # pandas.common.is_hashable() - assert not com.is_hashable(np.array([])) - - # old-style classes in Python 2 don't appear hashable to - # collections.Hashable but also seem to support hash() by default - if compat.PY2: - - class OldStyleClass(): - pass - - c = OldStyleClass() - assert not isinstance(c, collections.Hashable) - assert com.is_hashable(c) - hash(c) # this will not raise - - -def test_ensure_int32(): - values = np.arange(10, dtype=np.int32) - result = com._ensure_int32(values) - assert (result.dtype == np.int32) - - values = np.arange(10, dtype=np.int64) - result = com._ensure_int32(values) - assert (result.dtype == np.int32) - - -def test_is_re(): - passes = re.compile('ad'), - fails = 'x', 2, 3, object() - - for p in passes: - assert com.is_re(p) - - for f in fails: - assert not com.is_re(f) - - -def test_is_recompilable(): - passes = (r'a', u('x'), r'asdf', re.compile('adsf'), u(r'\u2233\s*'), - re.compile(r'')) - fails = 1, [], object() - - for p in passes: - assert com.is_re_compilable(p) - - for f in fails: - assert not com.is_re_compilable(f) - - def test_random_state(): import numpy.random as npr # Check with seed @@ -730,83 +188,6 @@ def test_maybe_match_name(): assert (matched == 'y') -class TestMaybe(tm.TestCase): - - def test_maybe_convert_string_to_array(self): - result = com._maybe_convert_string_to_object('x') - tm.assert_numpy_array_equal(result, np.array(['x'], dtype=object)) - self.assertTrue(result.dtype == object) - - result = com._maybe_convert_string_to_object(1) - self.assertEqual(result, 1) - - arr = np.array(['x', 'y'], dtype=str) - result = com._maybe_convert_string_to_object(arr) - 
tm.assert_numpy_array_equal(result, np.array(['x', 'y'], dtype=object)) - self.assertTrue(result.dtype == object) - - # unicode - arr = np.array(['x', 'y']).astype('U') - result = com._maybe_convert_string_to_object(arr) - tm.assert_numpy_array_equal(result, np.array(['x', 'y'], dtype=object)) - self.assertTrue(result.dtype == object) - - # object - arr = np.array(['x', 2], dtype=object) - result = com._maybe_convert_string_to_object(arr) - tm.assert_numpy_array_equal(result, np.array(['x', 2], dtype=object)) - self.assertTrue(result.dtype == object) - - def test_maybe_convert_scalar(self): - - # pass thru - result = com._maybe_convert_scalar('x') - self.assertEqual(result, 'x') - result = com._maybe_convert_scalar(np.array([1])) - self.assertEqual(result, np.array([1])) - - # leave scalar dtype - result = com._maybe_convert_scalar(np.int64(1)) - self.assertEqual(result, np.int64(1)) - result = com._maybe_convert_scalar(np.int32(1)) - self.assertEqual(result, np.int32(1)) - result = com._maybe_convert_scalar(np.float32(1)) - self.assertEqual(result, np.float32(1)) - result = com._maybe_convert_scalar(np.int64(1)) - self.assertEqual(result, np.float64(1)) - - # coerce - result = com._maybe_convert_scalar(1) - self.assertEqual(result, np.int64(1)) - result = com._maybe_convert_scalar(1.0) - self.assertEqual(result, np.float64(1)) - result = com._maybe_convert_scalar(pd.Timestamp('20130101')) - self.assertEqual(result, pd.Timestamp('20130101').value) - result = com._maybe_convert_scalar(datetime(2013, 1, 1)) - self.assertEqual(result, pd.Timestamp('20130101').value) - result = com._maybe_convert_scalar(pd.Timedelta('1 day 1 min')) - self.assertEqual(result, pd.Timedelta('1 day 1 min').value) - - -class TestConvert(tm.TestCase): - - def test_possibly_convert_objects_copy(self): - values = np.array([1, 2]) - - out = convert._possibly_convert_objects(values, copy=False) - self.assertTrue(values is out) - - out = convert._possibly_convert_objects(values, copy=True) - self.assertTrue(values is not out) - - values = np.array(['apply', 'banana']) - out = convert._possibly_convert_objects(values, copy=False) - self.assertTrue(values is out) - - out = convert._possibly_convert_objects(values, copy=True) - self.assertTrue(values is not out) - - def test_dict_compat(): data_datetime64 = {np.datetime64('1990-03-15'): 1, np.datetime64('2015-03-15'): 2} @@ -817,21 +198,6 @@ def test_dict_compat(): assert (com._dict_compat(data_unchanged) == data_unchanged) -def test_is_timedelta(): - assert (com.is_timedelta64_dtype('timedelta64')) - assert (com.is_timedelta64_dtype('timedelta64[ns]')) - assert (not com.is_timedelta64_ns_dtype('timedelta64')) - assert (com.is_timedelta64_ns_dtype('timedelta64[ns]')) - - tdi = TimedeltaIndex([1e14, 2e14], dtype='timedelta64') - assert (com.is_timedelta64_dtype(tdi)) - assert (com.is_timedelta64_ns_dtype(tdi)) - assert (com.is_timedelta64_ns_dtype(tdi.astype('timedelta64[ns]'))) - # Conversion to Int64Index: - assert (not com.is_timedelta64_ns_dtype(tdi.astype('timedelta64'))) - assert (not com.is_timedelta64_ns_dtype(tdi.astype('timedelta64[h]'))) - - if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py index cc0972937b8a2..c037f02f20609 100644 --- a/pandas/tests/test_expressions.py +++ b/pandas/tests/test_expressions.py @@ -208,8 +208,9 @@ def test_float_panel(self): @slow def test_panel4d(self): - self.run_panel(tm.makePanel4D(), 
np.random.randn() + 0.5, - assert_func=assert_panel4d_equal, binary_comp=3) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + self.run_panel(tm.makePanel4D(), np.random.randn() + 0.5, + assert_func=assert_panel4d_equal, binary_comp=3) def test_mixed_arithmetic_frame(self): # TODO: FIGURE OUT HOW TO GET IT TO WORK... diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index 2f4c2b414cc30..cdcd8b1bcba60 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -7,12 +7,12 @@ from numpy import nan import pandas as pd +from pandas.types.common import is_scalar from pandas import (Index, Series, DataFrame, Panel, isnull, date_range, period_range, Panel4D) from pandas.core.index import MultiIndex import pandas.formats.printing as printing -import pandas.lib as lib from pandas.compat import range, zip, PY3 from pandas import compat @@ -53,7 +53,7 @@ def _construct(self, shape, value=None, dtype=None, **kwargs): if isinstance(shape, int): shape = tuple([shape] * self._ndim) if value is not None: - if lib.isscalar(value): + if is_scalar(value): if value == 'empty': arr = None @@ -1532,17 +1532,41 @@ def test_to_xarray(self): tm._skip_if_no_xarray() from xarray import DataArray - p = tm.makePanel4D() + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + p = tm.makePanel4D() - result = p.to_xarray() - self.assertIsInstance(result, DataArray) - self.assertEqual(len(result.coords), 4) - assert_almost_equal(list(result.coords.keys()), - ['labels', 'items', 'major_axis', 'minor_axis']) - self.assertEqual(len(result.dims), 4) - - # non-convertible - self.assertRaises(ValueError, lambda: result.to_pandas()) + result = p.to_xarray() + self.assertIsInstance(result, DataArray) + self.assertEqual(len(result.coords), 4) + assert_almost_equal(list(result.coords.keys()), + ['labels', 'items', 'major_axis', + 'minor_axis']) + self.assertEqual(len(result.dims), 4) + + # non-convertible + self.assertRaises(ValueError, lambda: result.to_pandas()) + +# run all the tests, but wrap each in a warning catcher +for t in ['test_rename', 'test_rename_axis', 'test_get_numeric_data', + 'test_get_default', 'test_nonzero', + 'test_numpy_1_7_compat_numeric_methods', + 'test_downcast', 'test_constructor_compound_dtypes', + 'test_head_tail', + 'test_size_compat', 'test_split_compat', + 'test_unexpected_keyword', + 'test_stat_unexpected_keyword', 'test_api_compat', + 'test_stat_non_defaults_args', + 'test_clip', 'test_truncate_out_of_bounds', 'test_numpy_clip', + 'test_metadata_propagation']: + + def f(): + def tester(self): + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + return getattr(super(TestPanel4D, self), t)() + return tester + + setattr(TestPanel4D, t, f()) class TestNDFrame(tm.TestCase): @@ -1674,8 +1698,9 @@ def test_squeeze(self): tm.assert_frame_equal(df.squeeze(), df) for p in [tm.makePanel()]: tm.assert_panel_equal(p.squeeze(), p) - for p4d in [tm.makePanel4D()]: - tm.assert_panel4d_equal(p4d.squeeze(), p4d) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + for p4d in [tm.makePanel4D()]: + tm.assert_panel4d_equal(p4d.squeeze(), p4d) # squeezing df = tm.makeTimeDataFrame().reindex(columns=['A']) @@ -1687,11 +1712,13 @@ def test_squeeze(self): p = tm.makePanel().reindex(items=['ItemA'], minor_axis=['A']) tm.assert_series_equal(p.squeeze(), p.ix['ItemA', :, 'A']) - p4d = tm.makePanel4D().reindex(labels=['label1']) - tm.assert_panel_equal(p4d.squeeze(), p4d['label1']) + with 
tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + p4d = tm.makePanel4D().reindex(labels=['label1']) + tm.assert_panel_equal(p4d.squeeze(), p4d['label1']) - p4d = tm.makePanel4D().reindex(labels=['label1'], items=['ItemA']) - tm.assert_frame_equal(p4d.squeeze(), p4d.ix['label1', 'ItemA']) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + p4d = tm.makePanel4D().reindex(labels=['label1'], items=['ItemA']) + tm.assert_frame_equal(p4d.squeeze(), p4d.ix['label1', 'ItemA']) # don't fail with 0 length dimensions GH11229 & GH8999 empty_series = pd.Series([], name='five') @@ -1726,11 +1753,13 @@ def test_transpose(self): .transpose(1, 2, 0), p) tm.assertRaisesRegexp(TypeError, msg, p.transpose, 2, 0, 1, axes=(2, 0, 1)) - for p4d in [tm.makePanel4D()]: - tm.assert_panel4d_equal(p4d.transpose(2, 0, 3, 1) - .transpose(1, 3, 0, 2), p4d) - tm.assertRaisesRegexp(TypeError, msg, p4d.transpose, - 2, 0, 3, 1, axes=(2, 0, 3, 1)) + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + for p4d in [tm.makePanel4D()]: + tm.assert_panel4d_equal(p4d.transpose(2, 0, 3, 1) + .transpose(1, 3, 0, 2), p4d) + tm.assertRaisesRegexp(TypeError, msg, p4d.transpose, + 2, 0, 3, 1, axes=(2, 0, 3, 1)) def test_numpy_transpose(self): msg = "the 'axes' parameter is not supported" @@ -1752,10 +1781,11 @@ def test_numpy_transpose(self): np.transpose(p, axes=(2, 0, 1)), axes=(1, 2, 0)), p) - p4d = tm.makePanel4D() - tm.assert_panel4d_equal(np.transpose( - np.transpose(p4d, axes=(2, 0, 3, 1)), - axes=(1, 3, 0, 2)), p4d) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + p4d = tm.makePanel4D() + tm.assert_panel4d_equal(np.transpose( + np.transpose(p4d, axes=(2, 0, 3, 1)), + axes=(1, 3, 0, 2)), p4d) def test_take(self): indices = [1, 5, -2, 6, 3, -1] @@ -1780,21 +1810,25 @@ def test_take(self): major_axis=p.major_axis, minor_axis=p.minor_axis) tm.assert_panel_equal(out, expected) - for p4d in [tm.makePanel4D()]: - out = p4d.take(indices) - expected = Panel4D(data=p4d.values.take(indices, axis=0), - labels=p4d.labels.take(indices), - major_axis=p4d.major_axis, - minor_axis=p4d.minor_axis, - items=p4d.items) - tm.assert_panel4d_equal(out, expected) + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + for p4d in [tm.makePanel4D()]: + out = p4d.take(indices) + expected = Panel4D(data=p4d.values.take(indices, axis=0), + labels=p4d.labels.take(indices), + major_axis=p4d.major_axis, + minor_axis=p4d.minor_axis, + items=p4d.items) + tm.assert_panel4d_equal(out, expected) def test_take_invalid_kwargs(self): indices = [-3, 2, 0, 1] s = tm.makeFloatSeries() df = tm.makeTimeDataFrame() p = tm.makePanel() - p4d = tm.makePanel4D() + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + p4d = tm.makePanel4D() for obj in (s, df, p, p4d): msg = "take\(\) got an unexpected keyword argument 'foo'" diff --git a/pandas/tests/test_graphics_others.py b/pandas/tests/test_graphics_others.py deleted file mode 100644 index 7285d84865542..0000000000000 --- a/pandas/tests/test_graphics_others.py +++ /dev/null @@ -1,984 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -import nose -import itertools -import os -import string -import warnings -from distutils.version import LooseVersion - -from pandas import Series, DataFrame, MultiIndex -from pandas.compat import range, lmap, lzip -import pandas.util.testing as tm -from pandas.util.testing import slow - -import numpy as np -from numpy import random -from numpy.random import randn 
- -import pandas.tools.plotting as plotting - -from pandas.tests.test_graphics import (TestPlotBase, _check_plot_works, - curpath, _ok_for_gaussian_kde) - - -""" -These tests are for ``DataFrame.hist``, ``DataFrame.boxplot`` and -other miscellaneous plots. -`Dataframe.plot`` and ``Series.plot`` are tested in test_graphics.py -""" - - -def _skip_if_mpl_14_or_dev_boxplot(): - # GH 8382 - # Boxplot failures on 1.4 and 1.4.1 - # Don't need try / except since that's done at class level - import matplotlib - if str(matplotlib.__version__) >= LooseVersion('1.4'): - raise nose.SkipTest("Matplotlib Regression in 1.4 and current dev.") - - -@tm.mplskip -class TestSeriesPlots(TestPlotBase): - - def setUp(self): - TestPlotBase.setUp(self) - import matplotlib as mpl - mpl.rcdefaults() - - self.ts = tm.makeTimeSeries() - self.ts.name = 'ts' - - self.series = tm.makeStringSeries() - self.series.name = 'series' - - self.iseries = tm.makePeriodSeries() - self.iseries.name = 'iseries' - - @slow - def test_hist_legacy(self): - _check_plot_works(self.ts.hist) - _check_plot_works(self.ts.hist, grid=False) - _check_plot_works(self.ts.hist, figsize=(8, 10)) - _check_plot_works(self.ts.hist, by=self.ts.index.month) - _check_plot_works(self.ts.hist, by=self.ts.index.month, bins=5) - - fig, ax = self.plt.subplots(1, 1) - _check_plot_works(self.ts.hist, ax=ax) - _check_plot_works(self.ts.hist, ax=ax, figure=fig) - _check_plot_works(self.ts.hist, figure=fig) - tm.close() - - fig, (ax1, ax2) = self.plt.subplots(1, 2) - _check_plot_works(self.ts.hist, figure=fig, ax=ax1) - _check_plot_works(self.ts.hist, figure=fig, ax=ax2) - - with tm.assertRaises(ValueError): - self.ts.hist(by=self.ts.index, figure=fig) - - @slow - def test_hist_bins_legacy(self): - df = DataFrame(np.random.randn(10, 2)) - ax = df.hist(bins=2)[0][0] - self.assertEqual(len(ax.patches), 2) - - @slow - def test_hist_layout(self): - df = self.hist_df - with tm.assertRaises(ValueError): - df.height.hist(layout=(1, 1)) - - with tm.assertRaises(ValueError): - df.height.hist(layout=[1, 1]) - - @slow - def test_hist_layout_with_by(self): - df = self.hist_df - - axes = _check_plot_works(df.height.hist, by=df.gender, layout=(2, 1)) - self._check_axes_shape(axes, axes_num=2, layout=(2, 1)) - - axes = _check_plot_works(df.height.hist, by=df.gender, layout=(3, -1)) - self._check_axes_shape(axes, axes_num=2, layout=(3, 1)) - - axes = _check_plot_works(df.height.hist, by=df.category, layout=(4, 1)) - self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) - - axes = _check_plot_works( - df.height.hist, by=df.category, layout=(2, -1)) - self._check_axes_shape(axes, axes_num=4, layout=(2, 2)) - - axes = _check_plot_works( - df.height.hist, by=df.category, layout=(3, -1)) - self._check_axes_shape(axes, axes_num=4, layout=(3, 2)) - - axes = _check_plot_works( - df.height.hist, by=df.category, layout=(-1, 4)) - self._check_axes_shape(axes, axes_num=4, layout=(1, 4)) - - axes = _check_plot_works( - df.height.hist, by=df.classroom, layout=(2, 2)) - self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) - - axes = df.height.hist(by=df.category, layout=(4, 2), figsize=(12, 7)) - self._check_axes_shape( - axes, axes_num=4, layout=(4, 2), figsize=(12, 7)) - - @slow - def test_hist_no_overlap(self): - from matplotlib.pyplot import subplot, gcf - x = Series(randn(2)) - y = Series(randn(2)) - subplot(121) - x.hist() - subplot(122) - y.hist() - fig = gcf() - axes = fig.get_axes() - self.assertEqual(len(axes), 2) - - @slow - def test_hist_by_no_extra_plots(self): - df = 
self.hist_df - axes = df.height.hist(by=df.gender) # noqa - self.assertEqual(len(self.plt.get_fignums()), 1) - - @slow - def test_plot_fails_when_ax_differs_from_figure(self): - from pylab import figure - fig1 = figure() - fig2 = figure() - ax1 = fig1.add_subplot(111) - with tm.assertRaises(AssertionError): - self.ts.hist(ax=ax1, figure=fig2) - - @slow - def test_autocorrelation_plot(self): - from pandas.tools.plotting import autocorrelation_plot - _check_plot_works(autocorrelation_plot, series=self.ts) - _check_plot_works(autocorrelation_plot, series=self.ts.values) - - ax = autocorrelation_plot(self.ts, label='Test') - self._check_legend_labels(ax, labels=['Test']) - - @slow - def test_lag_plot(self): - from pandas.tools.plotting import lag_plot - _check_plot_works(lag_plot, series=self.ts) - _check_plot_works(lag_plot, series=self.ts, lag=5) - - @slow - def test_bootstrap_plot(self): - from pandas.tools.plotting import bootstrap_plot - _check_plot_works(bootstrap_plot, series=self.ts, size=10) - - -@tm.mplskip -class TestDataFramePlots(TestPlotBase): - - def setUp(self): - TestPlotBase.setUp(self) - import matplotlib as mpl - mpl.rcdefaults() - - self.tdf = tm.makeTimeDataFrame() - self.hexbin_df = DataFrame({ - "A": np.random.uniform(size=20), - "B": np.random.uniform(size=20), - "C": np.arange(20) + np.random.uniform(size=20)}) - - from pandas import read_csv - path = os.path.join(curpath(), 'data', 'iris.csv') - self.iris = read_csv(path) - - @slow - def test_boxplot_legacy(self): - df = DataFrame(randn(6, 4), - index=list(string.ascii_letters[:6]), - columns=['one', 'two', 'three', 'four']) - df['indic'] = ['foo', 'bar'] * 3 - df['indic2'] = ['foo', 'bar', 'foo'] * 2 - - _check_plot_works(df.boxplot, return_type='dict') - _check_plot_works(df.boxplot, column=[ - 'one', 'two'], return_type='dict') - _check_plot_works(df.boxplot, column=['one', 'two'], by='indic') - _check_plot_works(df.boxplot, column='one', by=['indic', 'indic2']) - _check_plot_works(df.boxplot, by='indic') - _check_plot_works(df.boxplot, by=['indic', 'indic2']) - _check_plot_works(plotting.boxplot, data=df['one'], return_type='dict') - _check_plot_works(df.boxplot, notch=1, return_type='dict') - _check_plot_works(df.boxplot, by='indic', notch=1) - - df = DataFrame(np.random.rand(10, 2), columns=['Col1', 'Col2']) - df['X'] = Series(['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B']) - df['Y'] = Series(['A'] * 10) - _check_plot_works(df.boxplot, by='X') - - # When ax is supplied and required number of axes is 1, - # passed ax should be used: - fig, ax = self.plt.subplots() - axes = df.boxplot('Col1', by='X', ax=ax) - self.assertIs(ax.get_axes(), axes) - - fig, ax = self.plt.subplots() - axes = df.groupby('Y').boxplot(ax=ax, return_type='axes') - self.assertIs(ax.get_axes(), axes['A']) - - # Multiple columns with an ax argument should use same figure - fig, ax = self.plt.subplots() - axes = df.boxplot(column=['Col1', 'Col2'], - by='X', ax=ax, return_type='axes') - self.assertIs(axes['Col1'].get_figure(), fig) - - # When by is None, check that all relevant lines are present in the - # dict - fig, ax = self.plt.subplots() - d = df.boxplot(ax=ax, return_type='dict') - lines = list(itertools.chain.from_iterable(d.values())) - self.assertEqual(len(ax.get_lines()), len(lines)) - - @slow - def test_boxplot_return_type_legacy(self): - # API change in https://github.com/pydata/pandas/pull/7096 - import matplotlib as mpl # noqa - - df = DataFrame(randn(6, 4), - index=list(string.ascii_letters[:6]), - columns=['one', 
'two', 'three', 'four']) - with tm.assertRaises(ValueError): - df.boxplot(return_type='NOTATYPE') - - with tm.assert_produces_warning(FutureWarning): - result = df.boxplot() - # change to Axes in future - self._check_box_return_type(result, 'dict') - - with tm.assert_produces_warning(False): - result = df.boxplot(return_type='dict') - self._check_box_return_type(result, 'dict') - - with tm.assert_produces_warning(False): - result = df.boxplot(return_type='axes') - self._check_box_return_type(result, 'axes') - - with tm.assert_produces_warning(False): - result = df.boxplot(return_type='both') - self._check_box_return_type(result, 'both') - - @slow - def test_boxplot_axis_limits(self): - - def _check_ax_limits(col, ax): - y_min, y_max = ax.get_ylim() - self.assertTrue(y_min <= col.min()) - self.assertTrue(y_max >= col.max()) - - df = self.hist_df.copy() - df['age'] = np.random.randint(1, 20, df.shape[0]) - # One full row - height_ax, weight_ax = df.boxplot(['height', 'weight'], by='category') - _check_ax_limits(df['height'], height_ax) - _check_ax_limits(df['weight'], weight_ax) - self.assertEqual(weight_ax._sharey, height_ax) - - # Two rows, one partial - p = df.boxplot(['height', 'weight', 'age'], by='category') - height_ax, weight_ax, age_ax = p[0, 0], p[0, 1], p[1, 0] - dummy_ax = p[1, 1] - _check_ax_limits(df['height'], height_ax) - _check_ax_limits(df['weight'], weight_ax) - _check_ax_limits(df['age'], age_ax) - self.assertEqual(weight_ax._sharey, height_ax) - self.assertEqual(age_ax._sharey, height_ax) - self.assertIsNone(dummy_ax._sharey) - - @slow - def test_boxplot_empty_column(self): - _skip_if_mpl_14_or_dev_boxplot() - df = DataFrame(np.random.randn(20, 4)) - df.loc[:, 0] = np.nan - _check_plot_works(df.boxplot, return_type='axes') - - @slow - def test_hist_df_legacy(self): - from matplotlib.patches import Rectangle - _check_plot_works(self.hist_df.hist) - - # make sure layout is handled - df = DataFrame(randn(100, 3)) - axes = _check_plot_works(df.hist, grid=False) - self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) - self.assertFalse(axes[1, 1].get_visible()) - - df = DataFrame(randn(100, 1)) - _check_plot_works(df.hist) - - # make sure layout is handled - df = DataFrame(randn(100, 6)) - axes = _check_plot_works(df.hist, layout=(4, 2)) - self._check_axes_shape(axes, axes_num=6, layout=(4, 2)) - - # make sure sharex, sharey is handled - _check_plot_works(df.hist, sharex=True, sharey=True) - - # handle figsize arg - _check_plot_works(df.hist, figsize=(8, 10)) - - # check bins argument - _check_plot_works(df.hist, bins=5) - - # make sure xlabelsize and xrot are handled - ser = df[0] - xf, yf = 20, 18 - xrot, yrot = 30, 40 - axes = ser.hist(xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot) - self._check_ticks_props(axes, xlabelsize=xf, xrot=xrot, - ylabelsize=yf, yrot=yrot) - - xf, yf = 20, 18 - xrot, yrot = 30, 40 - axes = df.hist(xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot) - self._check_ticks_props(axes, xlabelsize=xf, xrot=xrot, - ylabelsize=yf, yrot=yrot) - - tm.close() - # make sure kwargs to hist are handled - ax = ser.hist(normed=True, cumulative=True, bins=4) - # height of last bin (index 5) must be 1.0 - rects = [x for x in ax.get_children() if isinstance(x, Rectangle)] - self.assertAlmostEqual(rects[-1].get_height(), 1.0) - - tm.close() - ax = ser.hist(log=True) - # scale of y must be 'log' - self._check_ax_scales(ax, yaxis='log') - - tm.close() - - # propagate attr exception from matplotlib.Axes.hist - with tm.assertRaises(AttributeError): - 
ser.hist(foo='bar') - - @slow - def test_hist_layout(self): - df = DataFrame(randn(100, 3)) - - layout_to_expected_size = ( - {'layout': None, 'expected_size': (2, 2)}, # default is 2x2 - {'layout': (2, 2), 'expected_size': (2, 2)}, - {'layout': (4, 1), 'expected_size': (4, 1)}, - {'layout': (1, 4), 'expected_size': (1, 4)}, - {'layout': (3, 3), 'expected_size': (3, 3)}, - {'layout': (-1, 4), 'expected_size': (1, 4)}, - {'layout': (4, -1), 'expected_size': (4, 1)}, - {'layout': (-1, 2), 'expected_size': (2, 2)}, - {'layout': (2, -1), 'expected_size': (2, 2)} - ) - - for layout_test in layout_to_expected_size: - axes = df.hist(layout=layout_test['layout']) - expected = layout_test['expected_size'] - self._check_axes_shape(axes, axes_num=3, layout=expected) - - # layout too small for all 4 plots - with tm.assertRaises(ValueError): - df.hist(layout=(1, 1)) - - # invalid format for layout - with tm.assertRaises(ValueError): - df.hist(layout=(1,)) - with tm.assertRaises(ValueError): - df.hist(layout=(-1, -1)) - - @slow - def test_scatter_plot_legacy(self): - tm._skip_if_no_scipy() - - df = DataFrame(randn(100, 2)) - - def scat(**kwds): - return plotting.scatter_matrix(df, **kwds) - - _check_plot_works(scat) - _check_plot_works(scat, marker='+') - _check_plot_works(scat, vmin=0) - if _ok_for_gaussian_kde('kde'): - _check_plot_works(scat, diagonal='kde') - if _ok_for_gaussian_kde('density'): - _check_plot_works(scat, diagonal='density') - _check_plot_works(scat, diagonal='hist') - _check_plot_works(scat, range_padding=.1) - - def scat2(x, y, by=None, ax=None, figsize=None): - return plotting.scatter_plot(df, x, y, by, ax, figsize=None) - - _check_plot_works(scat2, x=0, y=1) - grouper = Series(np.repeat([1, 2, 3, 4, 5], 20), df.index) - _check_plot_works(scat2, x=0, y=1, by=grouper) - - def test_scatter_matrix_axis(self): - tm._skip_if_no_scipy() - scatter_matrix = plotting.scatter_matrix - - with tm.RNGContext(42): - df = DataFrame(randn(100, 3)) - - # we are plotting multiples on a sub-plot - with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(scatter_matrix, filterwarnings='always', - frame=df, range_padding=.1) - axes0_labels = axes[0][0].yaxis.get_majorticklabels() - - # GH 5662 - expected = ['-2', '-1', '0', '1', '2'] - self._check_text_labels(axes0_labels, expected) - self._check_ticks_props( - axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0) - - df[0] = ((df[0] - 2) / 3) - - # we are plotting multiples on a sub-plot - with tm.assert_produces_warning(UserWarning): - axes = _check_plot_works(scatter_matrix, filterwarnings='always', - frame=df, range_padding=.1) - axes0_labels = axes[0][0].yaxis.get_majorticklabels() - expected = ['-1.2', '-1.0', '-0.8', '-0.6', '-0.4', '-0.2', '0.0'] - self._check_text_labels(axes0_labels, expected) - self._check_ticks_props( - axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0) - - @slow - def test_andrews_curves(self): - from pandas.tools.plotting import andrews_curves - from matplotlib import cm - - df = self.iris - - _check_plot_works(andrews_curves, frame=df, class_column='Name') - - rgba = ('#556270', '#4ECDC4', '#C7F464') - ax = _check_plot_works(andrews_curves, frame=df, - class_column='Name', color=rgba) - self._check_colors( - ax.get_lines()[:10], linecolors=rgba, mapping=df['Name'][:10]) - - cnames = ['dodgerblue', 'aquamarine', 'seagreen'] - ax = _check_plot_works(andrews_curves, frame=df, - class_column='Name', color=cnames) - self._check_colors( - ax.get_lines()[:10], linecolors=cnames, mapping=df['Name'][:10]) - - ax = 
_check_plot_works(andrews_curves, frame=df, - class_column='Name', colormap=cm.jet) - cmaps = lmap(cm.jet, np.linspace(0, 1, df['Name'].nunique())) - self._check_colors( - ax.get_lines()[:10], linecolors=cmaps, mapping=df['Name'][:10]) - - length = 10 - df = DataFrame({"A": random.rand(length), - "B": random.rand(length), - "C": random.rand(length), - "Name": ["A"] * length}) - - _check_plot_works(andrews_curves, frame=df, class_column='Name') - - rgba = ('#556270', '#4ECDC4', '#C7F464') - ax = _check_plot_works(andrews_curves, frame=df, - class_column='Name', color=rgba) - self._check_colors( - ax.get_lines()[:10], linecolors=rgba, mapping=df['Name'][:10]) - - cnames = ['dodgerblue', 'aquamarine', 'seagreen'] - ax = _check_plot_works(andrews_curves, frame=df, - class_column='Name', color=cnames) - self._check_colors( - ax.get_lines()[:10], linecolors=cnames, mapping=df['Name'][:10]) - - ax = _check_plot_works(andrews_curves, frame=df, - class_column='Name', colormap=cm.jet) - cmaps = lmap(cm.jet, np.linspace(0, 1, df['Name'].nunique())) - self._check_colors( - ax.get_lines()[:10], linecolors=cmaps, mapping=df['Name'][:10]) - - colors = ['b', 'g', 'r'] - df = DataFrame({"A": [1, 2, 3], - "B": [1, 2, 3], - "C": [1, 2, 3], - "Name": colors}) - ax = andrews_curves(df, 'Name', color=colors) - handles, labels = ax.get_legend_handles_labels() - self._check_colors(handles, linecolors=colors) - - with tm.assert_produces_warning(FutureWarning): - andrews_curves(data=df, class_column='Name') - - @slow - def test_parallel_coordinates(self): - from pandas.tools.plotting import parallel_coordinates - from matplotlib import cm - - df = self.iris - - ax = _check_plot_works(parallel_coordinates, - frame=df, class_column='Name') - nlines = len(ax.get_lines()) - nxticks = len(ax.xaxis.get_ticklabels()) - - rgba = ('#556270', '#4ECDC4', '#C7F464') - ax = _check_plot_works(parallel_coordinates, - frame=df, class_column='Name', color=rgba) - self._check_colors( - ax.get_lines()[:10], linecolors=rgba, mapping=df['Name'][:10]) - - cnames = ['dodgerblue', 'aquamarine', 'seagreen'] - ax = _check_plot_works(parallel_coordinates, - frame=df, class_column='Name', color=cnames) - self._check_colors( - ax.get_lines()[:10], linecolors=cnames, mapping=df['Name'][:10]) - - ax = _check_plot_works(parallel_coordinates, - frame=df, class_column='Name', colormap=cm.jet) - cmaps = lmap(cm.jet, np.linspace(0, 1, df['Name'].nunique())) - self._check_colors( - ax.get_lines()[:10], linecolors=cmaps, mapping=df['Name'][:10]) - - ax = _check_plot_works(parallel_coordinates, - frame=df, class_column='Name', axvlines=False) - assert len(ax.get_lines()) == (nlines - nxticks) - - colors = ['b', 'g', 'r'] - df = DataFrame({"A": [1, 2, 3], - "B": [1, 2, 3], - "C": [1, 2, 3], - "Name": colors}) - ax = parallel_coordinates(df, 'Name', color=colors) - handles, labels = ax.get_legend_handles_labels() - self._check_colors(handles, linecolors=colors) - - with tm.assert_produces_warning(FutureWarning): - parallel_coordinates(data=df, class_column='Name') - with tm.assert_produces_warning(FutureWarning): - parallel_coordinates(df, 'Name', colors=colors) - - @slow - def test_radviz(self): - from pandas.tools.plotting import radviz - from matplotlib import cm - - df = self.iris - _check_plot_works(radviz, frame=df, class_column='Name') - - rgba = ('#556270', '#4ECDC4', '#C7F464') - ax = _check_plot_works( - radviz, frame=df, class_column='Name', color=rgba) - # skip Circle drawn as ticks - patches = [p for p in ax.patches[:20] if p.get_label() != 
''] - self._check_colors( - patches[:10], facecolors=rgba, mapping=df['Name'][:10]) - - cnames = ['dodgerblue', 'aquamarine', 'seagreen'] - _check_plot_works(radviz, frame=df, class_column='Name', color=cnames) - patches = [p for p in ax.patches[:20] if p.get_label() != ''] - self._check_colors(patches, facecolors=cnames, mapping=df['Name'][:10]) - - _check_plot_works(radviz, frame=df, - class_column='Name', colormap=cm.jet) - cmaps = lmap(cm.jet, np.linspace(0, 1, df['Name'].nunique())) - patches = [p for p in ax.patches[:20] if p.get_label() != ''] - self._check_colors(patches, facecolors=cmaps, mapping=df['Name'][:10]) - - colors = [[0., 0., 1., 1.], - [0., 0.5, 1., 1.], - [1., 0., 0., 1.]] - df = DataFrame({"A": [1, 2, 3], - "B": [2, 1, 3], - "C": [3, 2, 1], - "Name": ['b', 'g', 'r']}) - ax = radviz(df, 'Name', color=colors) - handles, labels = ax.get_legend_handles_labels() - self._check_colors(handles, facecolors=colors) - - -@tm.mplskip -class TestDataFrameGroupByPlots(TestPlotBase): - - @slow - def test_boxplot_legacy(self): - grouped = self.hist_df.groupby(by='gender') - with warnings.catch_warnings(): - warnings.simplefilter('ignore') - axes = _check_plot_works(grouped.boxplot, return_type='axes') - self._check_axes_shape(list(axes.values()), axes_num=2, layout=(1, 2)) - - axes = _check_plot_works(grouped.boxplot, subplots=False, - return_type='axes') - self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) - tuples = lzip(string.ascii_letters[:10], range(10)) - df = DataFrame(np.random.rand(10, 3), - index=MultiIndex.from_tuples(tuples)) - - grouped = df.groupby(level=1) - axes = _check_plot_works(grouped.boxplot, return_type='axes') - self._check_axes_shape(list(axes.values()), axes_num=10, layout=(4, 3)) - - axes = _check_plot_works(grouped.boxplot, subplots=False, - return_type='axes') - self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) - - grouped = df.unstack(level=1).groupby(level=0, axis=1) - axes = _check_plot_works(grouped.boxplot, return_type='axes') - self._check_axes_shape(list(axes.values()), axes_num=3, layout=(2, 2)) - - axes = _check_plot_works(grouped.boxplot, subplots=False, - return_type='axes') - self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) - - @slow - def test_grouped_plot_fignums(self): - n = 10 - weight = Series(np.random.normal(166, 20, size=n)) - height = Series(np.random.normal(60, 10, size=n)) - with tm.RNGContext(42): - gender = np.random.choice(['male', 'female'], size=n) - df = DataFrame({'height': height, 'weight': weight, 'gender': gender}) - gb = df.groupby('gender') - - res = gb.plot() - self.assertEqual(len(self.plt.get_fignums()), 2) - self.assertEqual(len(res), 2) - tm.close() - - res = gb.boxplot(return_type='axes') - self.assertEqual(len(self.plt.get_fignums()), 1) - self.assertEqual(len(res), 2) - tm.close() - - # now works with GH 5610 as gender is excluded - res = df.groupby('gender').hist() - tm.close() - - @slow - def test_grouped_hist_legacy(self): - from matplotlib.patches import Rectangle - - df = DataFrame(randn(500, 2), columns=['A', 'B']) - df['C'] = np.random.randint(0, 4, 500) - df['D'] = ['X'] * 500 - - axes = plotting.grouped_hist(df.A, by=df.C) - self._check_axes_shape(axes, axes_num=4, layout=(2, 2)) - - tm.close() - axes = df.hist(by=df.C) - self._check_axes_shape(axes, axes_num=4, layout=(2, 2)) - - tm.close() - # group by a key with single value - axes = df.hist(by='D', rot=30) - self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) - self._check_ticks_props(axes, xrot=30) - - tm.close() - # make sure 
kwargs to hist are handled - xf, yf = 20, 18 - xrot, yrot = 30, 40 - axes = plotting.grouped_hist(df.A, by=df.C, normed=True, - cumulative=True, bins=4, - xlabelsize=xf, xrot=xrot, - ylabelsize=yf, yrot=yrot) - # height of last bin (index 5) must be 1.0 - for ax in axes.ravel(): - rects = [x for x in ax.get_children() if isinstance(x, Rectangle)] - height = rects[-1].get_height() - self.assertAlmostEqual(height, 1.0) - self._check_ticks_props(axes, xlabelsize=xf, xrot=xrot, - ylabelsize=yf, yrot=yrot) - - tm.close() - axes = plotting.grouped_hist(df.A, by=df.C, log=True) - # scale of y must be 'log' - self._check_ax_scales(axes, yaxis='log') - - tm.close() - # propagate attr exception from matplotlib.Axes.hist - with tm.assertRaises(AttributeError): - plotting.grouped_hist(df.A, by=df.C, foo='bar') - - with tm.assert_produces_warning(FutureWarning): - df.hist(by='C', figsize='default') - - @slow - def test_grouped_hist_legacy2(self): - n = 10 - weight = Series(np.random.normal(166, 20, size=n)) - height = Series(np.random.normal(60, 10, size=n)) - with tm.RNGContext(42): - gender_int = np.random.choice([0, 1], size=n) - df_int = DataFrame({'height': height, 'weight': weight, - 'gender': gender_int}) - gb = df_int.groupby('gender') - axes = gb.hist() - self.assertEqual(len(axes), 2) - self.assertEqual(len(self.plt.get_fignums()), 2) - tm.close() - - @slow - def test_grouped_box_return_type(self): - df = self.hist_df - - # old style: return_type=None - result = df.boxplot(by='gender') - self.assertIsInstance(result, np.ndarray) - self._check_box_return_type( - result, None, - expected_keys=['height', 'weight', 'category']) - - # now for groupby - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = df.groupby('gender').boxplot() - self._check_box_return_type( - result, 'dict', expected_keys=['Male', 'Female']) - - columns2 = 'X B C D A G Y N Q O'.split() - df2 = DataFrame(random.randn(50, 10), columns=columns2) - categories2 = 'A B C D E F G H I J'.split() - df2['category'] = categories2 * 5 - - for t in ['dict', 'axes', 'both']: - returned = df.groupby('classroom').boxplot(return_type=t) - self._check_box_return_type( - returned, t, expected_keys=['A', 'B', 'C']) - - returned = df.boxplot(by='classroom', return_type=t) - self._check_box_return_type( - returned, t, - expected_keys=['height', 'weight', 'category']) - - returned = df2.groupby('category').boxplot(return_type=t) - self._check_box_return_type(returned, t, expected_keys=categories2) - - returned = df2.boxplot(by='category', return_type=t) - self._check_box_return_type(returned, t, expected_keys=columns2) - - @slow - def test_grouped_box_layout(self): - df = self.hist_df - - self.assertRaises(ValueError, df.boxplot, column=['weight', 'height'], - by=df.gender, layout=(1, 1)) - self.assertRaises(ValueError, df.boxplot, - column=['height', 'weight', 'category'], - layout=(2, 1), return_type='dict') - self.assertRaises(ValueError, df.boxplot, column=['weight', 'height'], - by=df.gender, layout=(-1, -1)) - - box = _check_plot_works(df.groupby('gender').boxplot, column='height', - return_type='dict') - self._check_axes_shape(self.plt.gcf().axes, axes_num=2, layout=(1, 2)) - - box = _check_plot_works(df.groupby('category').boxplot, - column='height', - return_type='dict') - self._check_axes_shape(self.plt.gcf().axes, axes_num=4, layout=(2, 2)) - - # GH 6769 - box = _check_plot_works(df.groupby('classroom').boxplot, - column='height', return_type='dict') - self._check_axes_shape(self.plt.gcf().axes, 
axes_num=3, layout=(2, 2)) - - # GH 5897 - axes = df.boxplot(column=['height', 'weight', 'category'], by='gender', - return_type='axes') - self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(2, 2)) - for ax in [axes['height']]: - self._check_visible(ax.get_xticklabels(), visible=False) - self._check_visible([ax.xaxis.get_label()], visible=False) - for ax in [axes['weight'], axes['category']]: - self._check_visible(ax.get_xticklabels()) - self._check_visible([ax.xaxis.get_label()]) - - box = df.groupby('classroom').boxplot( - column=['height', 'weight', 'category'], return_type='dict') - self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(2, 2)) - - box = _check_plot_works(df.groupby('category').boxplot, - column='height', - layout=(3, 2), return_type='dict') - self._check_axes_shape(self.plt.gcf().axes, axes_num=4, layout=(3, 2)) - box = _check_plot_works(df.groupby('category').boxplot, - column='height', - layout=(3, -1), return_type='dict') - self._check_axes_shape(self.plt.gcf().axes, axes_num=4, layout=(3, 2)) - - box = df.boxplot(column=['height', 'weight', 'category'], by='gender', - layout=(4, 1)) - self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(4, 1)) - - box = df.boxplot(column=['height', 'weight', 'category'], by='gender', - layout=(-1, 1)) - self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(3, 1)) - - box = df.groupby('classroom').boxplot( - column=['height', 'weight', 'category'], layout=(1, 4), - return_type='dict') - self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(1, 4)) - - box = df.groupby('classroom').boxplot( # noqa - column=['height', 'weight', 'category'], layout=(1, -1), - return_type='dict') - self._check_axes_shape(self.plt.gcf().axes, axes_num=3, layout=(1, 3)) - - @slow - def test_grouped_box_multiple_axes(self): - # GH 6970, GH 7069 - df = self.hist_df - - # check warning to ignore sharex / sharey - # this check should be done in the first function which - # passes multiple axes to plot, hist or boxplot - # location should be changed if other test is added - # which has earlier alphabetical order - with tm.assert_produces_warning(UserWarning): - fig, axes = self.plt.subplots(2, 2) - df.groupby('category').boxplot( - column='height', return_type='axes', ax=axes) - self._check_axes_shape(self.plt.gcf().axes, - axes_num=4, layout=(2, 2)) - - fig, axes = self.plt.subplots(2, 3) - with warnings.catch_warnings(): - warnings.simplefilter('ignore') - returned = df.boxplot(column=['height', 'weight', 'category'], - by='gender', return_type='axes', ax=axes[0]) - returned = np.array(list(returned.values())) - self._check_axes_shape(returned, axes_num=3, layout=(1, 3)) - self.assert_numpy_array_equal(returned, axes[0]) - self.assertIs(returned[0].figure, fig) - - # draw on second row - with warnings.catch_warnings(): - warnings.simplefilter('ignore') - returned = df.groupby('classroom').boxplot( - column=['height', 'weight', 'category'], - return_type='axes', ax=axes[1]) - returned = np.array(list(returned.values())) - self._check_axes_shape(returned, axes_num=3, layout=(1, 3)) - self.assert_numpy_array_equal(returned, axes[1]) - self.assertIs(returned[0].figure, fig) - - with tm.assertRaises(ValueError): - fig, axes = self.plt.subplots(2, 3) - # pass different number of axes from required - axes = df.groupby('classroom').boxplot(ax=axes) - - @slow - def test_grouped_hist_layout(self): - df = self.hist_df - self.assertRaises(ValueError, df.hist, column='weight', by=df.gender, - layout=(1, 1)) - 
self.assertRaises(ValueError, df.hist, column='height', by=df.category, - layout=(1, 3)) - self.assertRaises(ValueError, df.hist, column='height', by=df.category, - layout=(-1, -1)) - - axes = _check_plot_works(df.hist, column='height', by=df.gender, - layout=(2, 1)) - self._check_axes_shape(axes, axes_num=2, layout=(2, 1)) - - axes = _check_plot_works(df.hist, column='height', by=df.gender, - layout=(2, -1)) - self._check_axes_shape(axes, axes_num=2, layout=(2, 1)) - - axes = df.hist(column='height', by=df.category, layout=(4, 1)) - self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) - - axes = df.hist(column='height', by=df.category, layout=(-1, 1)) - self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) - - axes = df.hist(column='height', by=df.category, - layout=(4, 2), figsize=(12, 8)) - self._check_axes_shape( - axes, axes_num=4, layout=(4, 2), figsize=(12, 8)) - tm.close() - - # GH 6769 - axes = _check_plot_works( - df.hist, column='height', by='classroom', layout=(2, 2)) - self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) - - # without column - axes = _check_plot_works(df.hist, by='classroom') - self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) - - axes = df.hist(by='gender', layout=(3, 5)) - self._check_axes_shape(axes, axes_num=2, layout=(3, 5)) - - axes = df.hist(column=['height', 'weight', 'category']) - self._check_axes_shape(axes, axes_num=3, layout=(2, 2)) - - @slow - def test_grouped_hist_multiple_axes(self): - # GH 6970, GH 7069 - df = self.hist_df - - fig, axes = self.plt.subplots(2, 3) - returned = df.hist(column=['height', 'weight', 'category'], ax=axes[0]) - self._check_axes_shape(returned, axes_num=3, layout=(1, 3)) - self.assert_numpy_array_equal(returned, axes[0]) - self.assertIs(returned[0].figure, fig) - returned = df.hist(by='classroom', ax=axes[1]) - self._check_axes_shape(returned, axes_num=3, layout=(1, 3)) - self.assert_numpy_array_equal(returned, axes[1]) - self.assertIs(returned[0].figure, fig) - - with tm.assertRaises(ValueError): - fig, axes = self.plt.subplots(2, 3) - # pass different number of axes from required - axes = df.hist(column='height', ax=axes) - - @slow - def test_axis_share_x(self): - df = self.hist_df - # GH4089 - ax1, ax2 = df.hist(column='height', by=df.gender, sharex=True) - - # share x - self.assertTrue(ax1._shared_x_axes.joined(ax1, ax2)) - self.assertTrue(ax2._shared_x_axes.joined(ax1, ax2)) - - # don't share y - self.assertFalse(ax1._shared_y_axes.joined(ax1, ax2)) - self.assertFalse(ax2._shared_y_axes.joined(ax1, ax2)) - - @slow - def test_axis_share_y(self): - df = self.hist_df - ax1, ax2 = df.hist(column='height', by=df.gender, sharey=True) - - # share y - self.assertTrue(ax1._shared_y_axes.joined(ax1, ax2)) - self.assertTrue(ax2._shared_y_axes.joined(ax1, ax2)) - - # don't share x - self.assertFalse(ax1._shared_x_axes.joined(ax1, ax2)) - self.assertFalse(ax2._shared_x_axes.joined(ax1, ax2)) - - @slow - def test_axis_share_xy(self): - df = self.hist_df - ax1, ax2 = df.hist(column='height', by=df.gender, sharex=True, - sharey=True) - - # share both x and y - self.assertTrue(ax1._shared_x_axes.joined(ax1, ax2)) - self.assertTrue(ax2._shared_x_axes.joined(ax1, ax2)) - - self.assertTrue(ax1._shared_y_axes.joined(ax1, ax2)) - self.assertTrue(ax2._shared_y_axes.joined(ax1, ax2)) - - -if __name__ == '__main__': - nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], - exit=False) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 6659e6b106a67..9d8873d843642 100644 --- 
a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -5,7 +5,8 @@ from datetime import datetime from numpy import nan -from pandas import date_range, bdate_range, Timestamp +from pandas.types.common import _ensure_platform_int +from pandas import date_range, bdate_range, Timestamp, isnull from pandas.core.index import Index, MultiIndex, CategoricalIndex from pandas.core.api import Categorical, DataFrame from pandas.core.common import UnsupportedFunctionCall @@ -163,9 +164,9 @@ def test_first_last_nth(self): grouped['B'].nth(0) self.df.loc[self.df['A'] == 'foo', 'B'] = np.nan - self.assertTrue(com.isnull(grouped['B'].first()['foo'])) - self.assertTrue(com.isnull(grouped['B'].last()['foo'])) - self.assertTrue(com.isnull(grouped['B'].nth(0)['foo'])) + self.assertTrue(isnull(grouped['B'].first()['foo'])) + self.assertTrue(isnull(grouped['B'].last()['foo'])) + self.assertTrue(isnull(grouped['B'].nth(0)['foo'])) # v0.14.0 whatsnew df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) @@ -354,6 +355,35 @@ def test_nth_multi_index_as_expected(self): names=['A', 'B'])) assert_frame_equal(result, expected) + def test_group_selection_cache(self): + # GH 12839 nth, head, and tail should return same result consistently + df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) + expected = df.iloc[[0, 2]].set_index('A') + + g = df.groupby('A') + result1 = g.head(n=2) + result2 = g.nth(0) + assert_frame_equal(result1, df) + assert_frame_equal(result2, expected) + + g = df.groupby('A') + result1 = g.tail(n=2) + result2 = g.nth(0) + assert_frame_equal(result1, df) + assert_frame_equal(result2, expected) + + g = df.groupby('A') + result1 = g.nth(0) + result2 = g.head(n=2) + assert_frame_equal(result1, expected) + assert_frame_equal(result2, df) + + g = df.groupby('A') + result1 = g.nth(0) + result2 = g.tail(n=2) + assert_frame_equal(result1, expected) + assert_frame_equal(result2, df) + def test_grouper_index_types(self): # related GH5375 # groupby misbehaving when using a Floatlike index @@ -690,6 +720,32 @@ def test_agg_period_index(self): grouped = df.groupby(df.index.month) list(grouped) + def test_agg_dict_parameter_cast_result_dtypes(self): + # GH 12821 + + df = DataFrame( + {'class': ['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D'], + 'time': date_range('1/1/2011', periods=8, freq='H')}) + df.loc[[0, 1, 2, 5], 'time'] = None + + # test for `first` function + exp = df.loc[[0, 3, 4, 6]].set_index('class') + grouped = df.groupby('class') + assert_frame_equal(grouped.first(), exp) + assert_frame_equal(grouped.agg('first'), exp) + assert_frame_equal(grouped.agg({'time': 'first'}), exp) + assert_series_equal(grouped.time.first(), exp['time']) + assert_series_equal(grouped.time.agg('first'), exp['time']) + + # test for `last` function + exp = df.loc[[0, 3, 4, 7]].set_index('class') + grouped = df.groupby('class') + assert_frame_equal(grouped.last(), exp) + assert_frame_equal(grouped.agg('last'), exp) + assert_frame_equal(grouped.agg({'time': 'last'}), exp) + assert_series_equal(grouped.time.last(), exp['time']) + assert_series_equal(grouped.time.agg('last'), exp['time']) + def test_agg_must_agg(self): grouped = self.df.groupby('A')['C'] self.assertRaises(Exception, grouped.agg, lambda x: x.describe()) @@ -723,7 +779,7 @@ def test_get_group(self): g = df.groupby('DATE') key = list(g.groups)[0] result1 = g.get_group(key) - result2 = g.get_group(Timestamp(key).to_datetime()) + result2 = g.get_group(Timestamp(key).to_pydatetime()) result3 = g.get_group(str(Timestamp(key))) 
assert_frame_equal(result1, result2) assert_frame_equal(result1, result3) @@ -732,7 +788,7 @@ def test_get_group(self): key = list(g.groups)[0] result1 = g.get_group(key) - result2 = g.get_group((Timestamp(key[0]).to_datetime(), key[1])) + result2 = g.get_group((Timestamp(key[0]).to_pydatetime(), key[1])) result3 = g.get_group((str(Timestamp(key[0])), key[1])) assert_frame_equal(result1, result2) assert_frame_equal(result1, result3) @@ -1050,8 +1106,9 @@ def test_transform_fast(self): grp = df.groupby('id')['val'] values = np.repeat(grp.mean().values, - com._ensure_platform_int(grp.count().values)) + _ensure_platform_int(grp.count().values)) expected = pd.Series(values, index=df.index, name='val') + result = grp.transform(np.mean) assert_series_equal(result, expected) @@ -2085,8 +2142,8 @@ def test_groupby_head_tail(self): assert_frame_equal(df.loc[[0, 2]], g_not_as.head(1)) assert_frame_equal(df.loc[[1, 2]], g_not_as.tail(1)) - empty_not_as = DataFrame(columns=df.columns, index=pd.Index( - [], dtype=df.index.dtype)) + empty_not_as = DataFrame(columns=df.columns, + index=pd.Index([], dtype=df.index.dtype)) empty_not_as['A'] = empty_not_as['A'].astype(df.A.dtype) empty_not_as['B'] = empty_not_as['B'].astype(df.B.dtype) assert_frame_equal(empty_not_as, g_not_as.head(0)) @@ -2519,12 +2576,12 @@ def test_groupby_complex(self): def test_level_preserve_order(self): grouped = self.mframe.groupby(level=0) - exp_labels = np.array([0, 0, 0, 1, 1, 2, 2, 3, 3, 3]) + exp_labels = np.array([0, 0, 0, 1, 1, 2, 2, 3, 3, 3], np.intp) assert_almost_equal(grouped.grouper.labels[0], exp_labels) def test_grouping_labels(self): grouped = self.mframe.groupby(self.mframe.index.get_level_values(0)) - exp_labels = np.array([2, 2, 2, 0, 0, 1, 1, 3, 3, 3]) + exp_labels = np.array([2, 2, 2, 0, 0, 1, 1, 3, 3, 3], dtype=np.intp) assert_almost_equal(grouped.grouper.labels[0], exp_labels) def test_cython_fail_agg(self): @@ -2538,9 +2595,11 @@ def test_cython_fail_agg(self): def test_apply_series_to_frame(self): def f(piece): + with np.errstate(invalid='ignore'): + logged = np.log(piece) return DataFrame({'value': piece, 'demeaned': piece - piece.mean(), - 'logged': np.log(piece)}) + 'logged': logged}) dr = bdate_range('1/1/2000', periods=100) ts = Series(np.random.randn(100), index=dr) @@ -2555,6 +2614,16 @@ def test_apply_series_yield_constant(self): result = self.df.groupby(['A', 'B'])['C'].apply(len) self.assertEqual(result.index.names[:2], ('A', 'B')) + def test_apply_frame_yield_constant(self): + # GH13568 + result = self.df.groupby(['A', 'B']).apply(len) + self.assertTrue(isinstance(result, Series)) + self.assertIsNone(result.name) + + result = self.df.groupby(['A', 'B'])[['C', 'D']].apply(len) + self.assertTrue(isinstance(result, Series)) + self.assertIsNone(result.name) + def test_apply_frame_to_series(self): grouped = self.df.groupby(['A', 'B']) result = grouped.apply(len) @@ -2906,6 +2975,14 @@ def test_cython_api2(self): result = df.groupby('A', as_index=False).cumsum() assert_frame_equal(result, expected) + # GH 13994 + result = df.groupby('A').cumsum(axis=1) + expected = df.cumsum(axis=1) + assert_frame_equal(result, expected) + result = df.groupby('A').cumprod(axis=1) + expected = df.cumprod(axis=1) + assert_frame_equal(result, expected) + def test_grouping_ndarray(self): grouped = self.df.groupby(self.df['A'].values) @@ -3031,8 +3108,10 @@ def test_apply_categorical_data(self): grouped = df.groupby(['missing', 'dense']) # missing category 'b' should still exist in the output index - idx = 
MultiIndex.from_product([['a', 'b'], ['a', 'b', 'c']], - names=['missing', 'dense']) + idx = MultiIndex.from_product( + [Categorical(['a', 'b'], ordered=ordered), + Categorical(['a', 'b', 'c'], ordered=ordered)], + names=['missing', 'dense']) expected = DataFrame([0, 1, 2, np.nan, np.nan, np.nan], index=idx, columns=['values']) @@ -3169,6 +3248,18 @@ def test_groupby_nonstring_columns(self): expected = df.groupby(df[0]).mean() assert_frame_equal(result, expected) + def test_groupby_mixed_type_columns(self): + # GH 13432, unorderable types in py3 + df = DataFrame([[0, 1, 2]], columns=['A', 'B', 0]) + expected = DataFrame([[1, 2]], columns=['B', 0], + index=Index([0], name='A')) + + result = df.groupby('A').first() + tm.assert_frame_equal(result, expected) + + result = df.groupby('A').sum() + tm.assert_frame_equal(result, expected) + def test_cython_grouper_series_bug_noncontig(self): arr = np.empty((100, 100)) arr.fill(np.nan) @@ -3716,6 +3807,22 @@ def test_getitem_list_of_columns(self): assert_frame_equal(result2, expected) assert_frame_equal(result3, expected) + def test_getitem_numeric_column_names(self): + # GH #13731 + df = DataFrame({0: list('abcd') * 2, + 2: np.random.randn(8), + 4: np.random.randn(8), + 6: np.random.randn(8)}) + result = df.groupby(0)[df.columns[1:3]].mean() + result2 = df.groupby(0)[2, 4].mean() + result3 = df.groupby(0)[[2, 4]].mean() + + expected = df.ix[:, [0, 2, 4]].groupby(0).mean() + + assert_frame_equal(result, expected) + assert_frame_equal(result2, expected) + assert_frame_equal(result3, expected) + def test_agg_multiple_functions_maintain_order(self): # GH #610 funcs = [('mean', np.mean), ('max', np.max), ('min', np.min)] @@ -4549,6 +4656,15 @@ def test_groupby_with_empty(self): grouped = series.groupby(grouper) assert next(iter(grouped), None) is None + def test_groupby_with_single_column(self): + df = pd.DataFrame({'a': list('abssbab')}) + tm.assert_frame_equal(df.groupby('a').get_group('a'), df.iloc[[0, 5]]) + # GH 13530 + exp = pd.DataFrame([], index=pd.Index(['a', 'b', 's'], name='a')) + tm.assert_frame_equal(df.groupby('a').count(), exp) + tm.assert_frame_equal(df.groupby('a').sum(), exp) + tm.assert_frame_equal(df.groupby('a').nth(1), exp) + def test_groupby_with_small_elem(self): # GH 8542 # length=2 @@ -4989,8 +5105,8 @@ def test_cumcount_empty(self): ge = DataFrame().groupby(level=0) se = Series().groupby(level=0) - e = Series(dtype='int64' - ) # edge case, as this is usually considered float + # edge case, as this is usually considered float + e = Series(dtype='int64') assert_series_equal(e, ge.cumcount()) assert_series_equal(e, se.cumcount()) @@ -5854,22 +5970,22 @@ def test_lexsort_indexer(self): # orders=True, na_position='last' result = _lexsort_indexer(keys, orders=True, na_position='last') exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) - tm.assert_numpy_array_equal(result, np.array(exp)) + tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp)) # orders=True, na_position='first' result = _lexsort_indexer(keys, orders=True, na_position='first') exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) - tm.assert_numpy_array_equal(result, np.array(exp)) + tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp)) # orders=False, na_position='last' result = _lexsort_indexer(keys, orders=False, na_position='last') exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) - tm.assert_numpy_array_equal(result, np.array(exp)) + tm.assert_numpy_array_equal(result, np.array(exp, 
dtype=np.intp)) # orders=False, na_position='first' result = _lexsort_indexer(keys, orders=False, na_position='first') exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) - tm.assert_numpy_array_equal(result, np.array(exp)) + tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp)) def test_nargsort(self): # np.argsort(items) places NaNs last @@ -5896,49 +6012,49 @@ def test_nargsort(self): result = _nargsort(items, kind='mergesort', ascending=True, na_position='last') exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) - tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.int64)) + tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) # mergesort, ascending=True, na_position='first' result = _nargsort(items, kind='mergesort', ascending=True, na_position='first') exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) - tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.int64)) + tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) # mergesort, ascending=False, na_position='last' result = _nargsort(items, kind='mergesort', ascending=False, na_position='last') exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) - tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.int64)) + tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) # mergesort, ascending=False, na_position='first' result = _nargsort(items, kind='mergesort', ascending=False, na_position='first') exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) - tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.int64)) + tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) # mergesort, ascending=True, na_position='last' result = _nargsort(items2, kind='mergesort', ascending=True, na_position='last') exp = list(range(5, 105)) + list(range(5)) + list(range(105, 110)) - tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.int64)) + tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) # mergesort, ascending=True, na_position='first' result = _nargsort(items2, kind='mergesort', ascending=True, na_position='first') exp = list(range(5)) + list(range(105, 110)) + list(range(5, 105)) - tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.int64)) + tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) # mergesort, ascending=False, na_position='last' result = _nargsort(items2, kind='mergesort', ascending=False, na_position='last') exp = list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)) - tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.int64)) + tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) # mergesort, ascending=False, na_position='first' result = _nargsort(items2, kind='mergesort', ascending=False, na_position='first') exp = list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)) - tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.int64)) + tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False) def test_datetime_count(self): df = DataFrame({'a': [1, 2, 3] * 2, @@ -6107,7 +6223,7 @@ def test_cython_transform(self): # bit a of hack to make sure the cythonized shift # is equivalent to pre 0.17.1 behavior if op == 'shift': - gb._set_selection_from_grouper() + gb._set_group_selection() for (op, args), targop in ops: if op != 'shift' and 'int' not in gb_target: @@ -6275,7 +6391,8 @@ def test_groupby_categorical_two_columns(self): groups_double_key 
= test.groupby(["cat", "ints"]) res = groups_double_key.agg('mean') exp = DataFrame({"val": [10, 30, 20, 40, np.nan, np.nan], - "cat": ["a", "a", "b", "b", "c", "c"], + "cat": pd.Categorical(["a", "a", "b", "b", "c", "c"], + ordered=True), "ints": [1, 2, 1, 2, 1, 2]}).set_index(["cat", "ints" ]) tm.assert_frame_equal(res, exp) @@ -6295,15 +6412,100 @@ def test_groupby_categorical_two_columns(self): res = groups_double_key.agg('mean') nan = np.nan - idx = MultiIndex.from_product([["(1, 2]", "(2, 3]", "(3, 6]"], - [1, 2, 3, 4]], - names=["cat", "C2"]) + idx = MultiIndex.from_product( + [Categorical(["(1, 2]", "(2, 3]", "(3, 6]"], ordered=True), + [1, 2, 3, 4]], + names=["cat", "C2"]) exp = DataFrame({"C1": [nan, nan, nan, nan, 3, 3, nan, nan, nan, nan, 4, 5], "C3": [nan, nan, nan, nan, 10, 100, nan, nan, nan, nan, 200, 34]}, index=idx) tm.assert_frame_equal(res, exp) + def test_groupby_multi_categorical_as_index(self): + # GH13204 + df = DataFrame({'cat': Categorical([1, 2, 2], [1, 2, 3]), + 'A': [10, 11, 11], + 'B': [101, 102, 103]}) + result = df.groupby(['cat', 'A'], as_index=False).sum() + expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), + 'A': [10, 11, 10, 11, 10, 11], + 'B': [101.0, nan, nan, 205.0, nan, nan]}, + columns=['cat', 'A', 'B']) + tm.assert_frame_equal(result, expected) + + # function grouper + f = lambda r: df.loc[r, 'A'] + result = df.groupby(['cat', f], as_index=False).sum() + expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), + 'A': [10.0, nan, nan, 22.0, nan, nan], + 'B': [101.0, nan, nan, 205.0, nan, nan]}, + columns=['cat', 'A', 'B']) + tm.assert_frame_equal(result, expected) + + # another not in-axis grouper (conflicting names in index) + s = Series(['a', 'b', 'b'], name='cat') + result = df.groupby(['cat', s], as_index=False).sum() + expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), + 'A': [10.0, nan, nan, 22.0, nan, nan], + 'B': [101.0, nan, nan, 205.0, nan, nan]}, + columns=['cat', 'A', 'B']) + tm.assert_frame_equal(result, expected) + + # is original index dropped? 
+ expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), + 'A': [10, 11, 10, 11, 10, 11], + 'B': [101.0, nan, nan, 205.0, nan, nan]}, + columns=['cat', 'A', 'B']) + + for name in [None, 'X', 'B', 'cat']: + df.index = Index(list("abc"), name=name) + result = df.groupby(['cat', 'A'], as_index=False).sum() + tm.assert_frame_equal(result, expected, check_index_type=True) + + def test_groupby_preserve_categorical_dtype(self): + # GH13743, GH13854 + df = DataFrame({'A': [1, 2, 1, 1, 2], + 'B': [10, 16, 22, 28, 34], + 'C1': Categorical(list("abaab"), + categories=list("bac"), + ordered=False), + 'C2': Categorical(list("abaab"), + categories=list("bac"), + ordered=True)}) + # single grouper + exp_full = DataFrame({'A': [2.0, 1.0, np.nan], + 'B': [25.0, 20.0, np.nan], + 'C1': Categorical(list("bac"), + categories=list("bac"), + ordered=False), + 'C2': Categorical(list("bac"), + categories=list("bac"), + ordered=True)}) + for col in ['C1', 'C2']: + result1 = df.groupby(by=col, as_index=False).mean() + result2 = df.groupby(by=col, as_index=True).mean().reset_index() + expected = exp_full.reindex(columns=result1.columns) + tm.assert_frame_equal(result1, expected) + tm.assert_frame_equal(result2, expected) + + # multiple grouper + exp_full = DataFrame({'A': [1, 1, 1, 2, 2, 2], + 'B': [np.nan, 20.0, np.nan, 25.0, np.nan, + np.nan], + 'C1': Categorical(list("bacbac"), + categories=list("bac"), + ordered=False), + 'C2': Categorical(list("bacbac"), + categories=list("bac"), + ordered=True)}) + for cols in [['A', 'C1'], ['A', 'C2']]: + result1 = df.groupby(by=cols, as_index=False).mean() + result2 = df.groupby(by=cols, as_index=True).mean().reset_index() + expected = exp_full.reindex(columns=result1.columns) + tm.assert_frame_equal(result1, expected) + tm.assert_frame_equal(result2, expected) + def test_groupby_apply_all_none(self): # Tests to make sure no errors if apply function returns all None # values. Issue 9684. @@ -6431,6 +6633,37 @@ def test_numpy_compat(self): tm.assertRaisesRegexp(UnsupportedFunctionCall, msg, getattr(g, func), foo=1) + def test_grouping_string_repr(self): + # GH 13394 + mi = MultiIndex.from_arrays([list("AAB"), list("aba")]) + df = DataFrame([[1, 2, 3]], columns=mi) + gr = df.groupby(df[('A', 'a')]) + + result = gr.grouper.groupings[0].__repr__() + expected = "Grouping(('A', 'a'))" + tm.assert_equal(result, expected) + + def test_group_shift_with_null_key(self): + # This test is designed to replicate the segfault in issue #13813. + n_rows = 1200 + + # Generate a moderately large dataframe with occasional missing + # values in column `B`, and then group by [`A`, `B`]. This should + # force `-1` in `labels` array of `g.grouper.group_info` exactly + # at those places where the group-by key is partially missing. 
+ df = DataFrame([(i % 12, i % 3 if i % 3 else np.nan, i) + for i in range(n_rows)], dtype=float, + columns=["A", "B", "Z"], index=None) + g = df.groupby(["A", "B"]) + + expected = DataFrame([(i + 12 if i % 3 and i < n_rows - 12 + else np.nan) + for i in range(n_rows)], dtype=float, + columns=["Z"], index=None) + result = g.shift(-1) + + assert_frame_equal(result, expected) + def assert_fp_equal(a, b): assert (np.abs(a - b) < 1e-12).all() diff --git a/pandas/tests/test_infer_and_convert.py b/pandas/tests/test_infer_and_convert.py deleted file mode 100644 index 68eac12e5ec4c..0000000000000 --- a/pandas/tests/test_infer_and_convert.py +++ /dev/null @@ -1,408 +0,0 @@ -# -*- coding: utf-8 -*- - -from datetime import datetime, timedelta, date, time - -import numpy as np -import pandas as pd -import pandas.lib as lib -import pandas.util.testing as tm -from pandas import Index - -from pandas.compat import long, u, PY2 - - -class TestInference(tm.TestCase): - - def test_infer_dtype_bytes(self): - compare = 'string' if PY2 else 'bytes' - - # string array of bytes - arr = np.array(list('abc'), dtype='S1') - self.assertEqual(pd.lib.infer_dtype(arr), compare) - - # object array of bytes - arr = arr.astype(object) - self.assertEqual(pd.lib.infer_dtype(arr), compare) - - def test_isinf_scalar(self): - # GH 11352 - self.assertTrue(lib.isposinf_scalar(float('inf'))) - self.assertTrue(lib.isposinf_scalar(np.inf)) - self.assertFalse(lib.isposinf_scalar(-np.inf)) - self.assertFalse(lib.isposinf_scalar(1)) - self.assertFalse(lib.isposinf_scalar('a')) - - self.assertTrue(lib.isneginf_scalar(float('-inf'))) - self.assertTrue(lib.isneginf_scalar(-np.inf)) - self.assertFalse(lib.isneginf_scalar(np.inf)) - self.assertFalse(lib.isneginf_scalar(1)) - self.assertFalse(lib.isneginf_scalar('a')) - - def test_maybe_convert_numeric_infinities(self): - # see gh-13274 - infinities = ['inf', 'inF', 'iNf', 'Inf', - 'iNF', 'InF', 'INf', 'INF'] - na_values = set(['', 'NULL', 'nan']) - - pos = np.array(['inf'], dtype=np.float64) - neg = np.array(['-inf'], dtype=np.float64) - - msg = "Unable to parse string" - - for infinity in infinities: - for maybe_int in (True, False): - out = lib.maybe_convert_numeric( - np.array([infinity], dtype=object), - na_values, maybe_int) - tm.assert_numpy_array_equal(out, pos) - - out = lib.maybe_convert_numeric( - np.array(['-' + infinity], dtype=object), - na_values, maybe_int) - tm.assert_numpy_array_equal(out, neg) - - out = lib.maybe_convert_numeric( - np.array([u(infinity)], dtype=object), - na_values, maybe_int) - tm.assert_numpy_array_equal(out, pos) - - out = lib.maybe_convert_numeric( - np.array(['+' + infinity], dtype=object), - na_values, maybe_int) - tm.assert_numpy_array_equal(out, pos) - - # too many characters - with tm.assertRaisesRegexp(ValueError, msg): - lib.maybe_convert_numeric( - np.array(['foo_' + infinity], dtype=object), - na_values, maybe_int) - - def test_maybe_convert_numeric_post_floatify_nan(self): - # see gh-13314 - data = np.array(['1.200', '-999.000', '4.500'], dtype=object) - expected = np.array([1.2, np.nan, 4.5], dtype=np.float64) - nan_values = set([-999, -999.0]) - - for coerce_type in (True, False): - out = lib.maybe_convert_numeric(data, nan_values, coerce_type) - tm.assert_numpy_array_equal(out, expected) - - def test_convert_infs(self): - arr = np.array(['inf', 'inf', 'inf'], dtype='O') - result = lib.maybe_convert_numeric(arr, set(), False) - self.assertTrue(result.dtype == np.float64) - - arr = np.array(['-inf', '-inf', '-inf'], dtype='O') - result = 
lib.maybe_convert_numeric(arr, set(), False) - self.assertTrue(result.dtype == np.float64) - - def test_scientific_no_exponent(self): - # See PR 12215 - arr = np.array(['42E', '2E', '99e', '6e'], dtype='O') - result = lib.maybe_convert_numeric(arr, set(), False, True) - self.assertTrue(np.all(np.isnan(result))) - - def test_convert_non_hashable(self): - # GH13324 - # make sure that we are handing non-hashables - arr = np.array([[10.0, 2], 1.0, 'apple']) - result = lib.maybe_convert_numeric(arr, set(), False, True) - tm.assert_numpy_array_equal(result, np.array([np.nan, 1.0, np.nan])) - - -class TestTypeInference(tm.TestCase): - _multiprocess_can_split_ = True - - def test_length_zero(self): - result = lib.infer_dtype(np.array([], dtype='i4')) - self.assertEqual(result, 'integer') - - result = lib.infer_dtype([]) - self.assertEqual(result, 'empty') - - def test_integers(self): - arr = np.array([1, 2, 3, np.int64(4), np.int32(5)], dtype='O') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'integer') - - arr = np.array([1, 2, 3, np.int64(4), np.int32(5), 'foo'], dtype='O') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'mixed-integer') - - arr = np.array([1, 2, 3, 4, 5], dtype='i4') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'integer') - - def test_bools(self): - arr = np.array([True, False, True, True, True], dtype='O') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'boolean') - - arr = np.array([np.bool_(True), np.bool_(False)], dtype='O') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'boolean') - - arr = np.array([True, False, True, 'foo'], dtype='O') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'mixed') - - arr = np.array([True, False, True], dtype=bool) - result = lib.infer_dtype(arr) - self.assertEqual(result, 'boolean') - - def test_floats(self): - arr = np.array([1., 2., 3., np.float64(4), np.float32(5)], dtype='O') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'floating') - - arr = np.array([1, 2, 3, np.float64(4), np.float32(5), 'foo'], - dtype='O') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'mixed-integer') - - arr = np.array([1, 2, 3, 4, 5], dtype='f4') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'floating') - - arr = np.array([1, 2, 3, 4, 5], dtype='f8') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'floating') - - def test_string(self): - pass - - def test_unicode(self): - pass - - def test_datetime(self): - - dates = [datetime(2012, 1, x) for x in range(1, 20)] - index = Index(dates) - self.assertEqual(index.inferred_type, 'datetime64') - - def test_date(self): - - dates = [date(2012, 1, x) for x in range(1, 20)] - index = Index(dates) - self.assertEqual(index.inferred_type, 'date') - - def test_to_object_array_tuples(self): - r = (5, 6) - values = [r] - result = lib.to_object_array_tuples(values) - - try: - # make sure record array works - from collections import namedtuple - record = namedtuple('record', 'x y') - r = record(5, 6) - values = [r] - result = lib.to_object_array_tuples(values) # noqa - except ImportError: - pass - - def test_to_object_array_width(self): - # see gh-13320 - rows = [[1, 2, 3], [4, 5, 6]] - - expected = np.array(rows, dtype=object) - out = lib.to_object_array(rows) - tm.assert_numpy_array_equal(out, expected) - - expected = np.array(rows, dtype=object) - out = lib.to_object_array(rows, min_width=1) - tm.assert_numpy_array_equal(out, expected) - - expected = np.array([[1, 2, 3, None, None], - [4, 5, 6, None, None]], 
dtype=object) - out = lib.to_object_array(rows, min_width=5) - tm.assert_numpy_array_equal(out, expected) - - def test_object(self): - - # GH 7431 - # cannot infer more than this as only a single element - arr = np.array([None], dtype='O') - result = lib.infer_dtype(arr) - self.assertEqual(result, 'mixed') - - def test_categorical(self): - - # GH 8974 - from pandas import Categorical, Series - arr = Categorical(list('abc')) - result = lib.infer_dtype(arr) - self.assertEqual(result, 'categorical') - - result = lib.infer_dtype(Series(arr)) - self.assertEqual(result, 'categorical') - - arr = Categorical(list('abc'), categories=['cegfab'], ordered=True) - result = lib.infer_dtype(arr) - self.assertEqual(result, 'categorical') - - result = lib.infer_dtype(Series(arr)) - self.assertEqual(result, 'categorical') - - -class TestConvert(tm.TestCase): - - def test_convert_objects(self): - arr = np.array(['a', 'b', np.nan, np.nan, 'd', 'e', 'f'], dtype='O') - result = lib.maybe_convert_objects(arr) - self.assertTrue(result.dtype == np.object_) - - def test_convert_objects_ints(self): - # test that we can detect many kinds of integers - dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8'] - - for dtype_str in dtypes: - arr = np.array(list(np.arange(20, dtype=dtype_str)), dtype='O') - self.assertTrue(arr[0].dtype == np.dtype(dtype_str)) - result = lib.maybe_convert_objects(arr) - self.assertTrue(issubclass(result.dtype.type, np.integer)) - - def test_convert_objects_complex_number(self): - for dtype in np.sctypes['complex']: - arr = np.array(list(1j * np.arange(20, dtype=dtype)), dtype='O') - self.assertTrue(arr[0].dtype == np.dtype(dtype)) - result = lib.maybe_convert_objects(arr) - self.assertTrue(issubclass(result.dtype.type, np.complexfloating)) - - -class Testisscalar(tm.TestCase): - - def test_isscalar_builtin_scalars(self): - self.assertTrue(lib.isscalar(None)) - self.assertTrue(lib.isscalar(True)) - self.assertTrue(lib.isscalar(False)) - self.assertTrue(lib.isscalar(0.)) - self.assertTrue(lib.isscalar(np.nan)) - self.assertTrue(lib.isscalar('foobar')) - self.assertTrue(lib.isscalar(b'foobar')) - self.assertTrue(lib.isscalar(u('efoobar'))) - self.assertTrue(lib.isscalar(datetime(2014, 1, 1))) - self.assertTrue(lib.isscalar(date(2014, 1, 1))) - self.assertTrue(lib.isscalar(time(12, 0))) - self.assertTrue(lib.isscalar(timedelta(hours=1))) - self.assertTrue(lib.isscalar(pd.NaT)) - - def test_isscalar_builtin_nonscalars(self): - self.assertFalse(lib.isscalar({})) - self.assertFalse(lib.isscalar([])) - self.assertFalse(lib.isscalar([1])) - self.assertFalse(lib.isscalar(())) - self.assertFalse(lib.isscalar((1, ))) - self.assertFalse(lib.isscalar(slice(None))) - self.assertFalse(lib.isscalar(Ellipsis)) - - def test_isscalar_numpy_array_scalars(self): - self.assertTrue(lib.isscalar(np.int64(1))) - self.assertTrue(lib.isscalar(np.float64(1.))) - self.assertTrue(lib.isscalar(np.int32(1))) - self.assertTrue(lib.isscalar(np.object_('foobar'))) - self.assertTrue(lib.isscalar(np.str_('foobar'))) - self.assertTrue(lib.isscalar(np.unicode_(u('foobar')))) - self.assertTrue(lib.isscalar(np.bytes_(b'foobar'))) - self.assertTrue(lib.isscalar(np.datetime64('2014-01-01'))) - self.assertTrue(lib.isscalar(np.timedelta64(1, 'h'))) - - def test_isscalar_numpy_zerodim_arrays(self): - for zerodim in [np.array(1), np.array('foobar'), - np.array(np.datetime64('2014-01-01')), - np.array(np.timedelta64(1, 'h')), - np.array(np.datetime64('NaT'))]: - self.assertFalse(lib.isscalar(zerodim)) - 
self.assertTrue(lib.isscalar(lib.item_from_zerodim(zerodim))) - - def test_isscalar_numpy_arrays(self): - self.assertFalse(lib.isscalar(np.array([]))) - self.assertFalse(lib.isscalar(np.array([[]]))) - self.assertFalse(lib.isscalar(np.matrix('1; 2'))) - - def test_isscalar_pandas_scalars(self): - self.assertTrue(lib.isscalar(pd.Timestamp('2014-01-01'))) - self.assertTrue(lib.isscalar(pd.Timedelta(hours=1))) - self.assertTrue(lib.isscalar(pd.Period('2014-01-01'))) - - def test_lisscalar_pandas_containers(self): - self.assertFalse(lib.isscalar(pd.Series())) - self.assertFalse(lib.isscalar(pd.Series([1]))) - self.assertFalse(lib.isscalar(pd.DataFrame())) - self.assertFalse(lib.isscalar(pd.DataFrame([[1]]))) - self.assertFalse(lib.isscalar(pd.Panel())) - self.assertFalse(lib.isscalar(pd.Panel([[[1]]]))) - self.assertFalse(lib.isscalar(pd.Index([]))) - self.assertFalse(lib.isscalar(pd.Index([1]))) - - -class TestParseSQL(tm.TestCase): - - def test_convert_sql_column_floats(self): - arr = np.array([1.5, None, 3, 4.2], dtype=object) - result = lib.convert_sql_column(arr) - expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8') - self.assert_numpy_array_equal(result, expected) - - def test_convert_sql_column_strings(self): - arr = np.array(['1.5', None, '3', '4.2'], dtype=object) - result = lib.convert_sql_column(arr) - expected = np.array(['1.5', np.nan, '3', '4.2'], dtype=object) - self.assert_numpy_array_equal(result, expected) - - def test_convert_sql_column_unicode(self): - arr = np.array([u('1.5'), None, u('3'), u('4.2')], - dtype=object) - result = lib.convert_sql_column(arr) - expected = np.array([u('1.5'), np.nan, u('3'), u('4.2')], - dtype=object) - self.assert_numpy_array_equal(result, expected) - - def test_convert_sql_column_ints(self): - arr = np.array([1, 2, 3, 4], dtype='O') - arr2 = np.array([1, 2, 3, 4], dtype='i4').astype('O') - result = lib.convert_sql_column(arr) - result2 = lib.convert_sql_column(arr2) - expected = np.array([1, 2, 3, 4], dtype='i8') - self.assert_numpy_array_equal(result, expected) - self.assert_numpy_array_equal(result2, expected) - - arr = np.array([1, 2, 3, None, 4], dtype='O') - result = lib.convert_sql_column(arr) - expected = np.array([1, 2, 3, np.nan, 4], dtype='f8') - self.assert_numpy_array_equal(result, expected) - - def test_convert_sql_column_longs(self): - arr = np.array([long(1), long(2), long(3), long(4)], dtype='O') - result = lib.convert_sql_column(arr) - expected = np.array([1, 2, 3, 4], dtype='i8') - self.assert_numpy_array_equal(result, expected) - - arr = np.array([long(1), long(2), long(3), None, long(4)], dtype='O') - result = lib.convert_sql_column(arr) - expected = np.array([1, 2, 3, np.nan, 4], dtype='f8') - self.assert_numpy_array_equal(result, expected) - - def test_convert_sql_column_bools(self): - arr = np.array([True, False, True, False], dtype='O') - result = lib.convert_sql_column(arr) - expected = np.array([True, False, True, False], dtype=bool) - self.assert_numpy_array_equal(result, expected) - - arr = np.array([True, False, None, False], dtype='O') - result = lib.convert_sql_column(arr) - expected = np.array([True, False, np.nan, False], dtype=object) - self.assert_numpy_array_equal(result, expected) - - def test_convert_sql_column_decimals(self): - from decimal import Decimal - arr = np.array([Decimal('1.5'), None, Decimal('3'), Decimal('4.2')]) - result = lib.convert_sql_column(arr) - expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8') - self.assert_numpy_array_equal(result, expected) - -if __name__ == '__main__': - 
import nose - - nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], - exit=False) diff --git a/pandas/tests/test_join.py b/pandas/tests/test_join.py new file mode 100644 index 0000000000000..bfdb77f3fb350 --- /dev/null +++ b/pandas/tests/test_join.py @@ -0,0 +1,201 @@ +# -*- coding: utf-8 -*- + +import numpy as np +from pandas import Index + +import pandas._join as _join +import pandas.util.testing as tm +from pandas.util.testing import assert_almost_equal + + +class TestIndexer(tm.TestCase): + _multiprocess_can_split_ = True + + def test_outer_join_indexer(self): + typemap = [('int32', _join.outer_join_indexer_int32), + ('int64', _join.outer_join_indexer_int64), + ('float32', _join.outer_join_indexer_float32), + ('float64', _join.outer_join_indexer_float64), + ('object', _join.outer_join_indexer_object)] + + for dtype, indexer in typemap: + left = np.arange(3, dtype=dtype) + right = np.arange(2, 5, dtype=dtype) + empty = np.array([], dtype=dtype) + + result, lindexer, rindexer = indexer(left, right) + tm.assertIsInstance(result, np.ndarray) + tm.assertIsInstance(lindexer, np.ndarray) + tm.assertIsInstance(rindexer, np.ndarray) + tm.assert_numpy_array_equal(result, np.arange(5, dtype=dtype)) + exp = np.array([0, 1, 2, -1, -1], dtype=np.int64) + tm.assert_numpy_array_equal(lindexer, exp) + exp = np.array([-1, -1, 0, 1, 2], dtype=np.int64) + tm.assert_numpy_array_equal(rindexer, exp) + + result, lindexer, rindexer = indexer(empty, right) + tm.assert_numpy_array_equal(result, right) + exp = np.array([-1, -1, -1], dtype=np.int64) + tm.assert_numpy_array_equal(lindexer, exp) + exp = np.array([0, 1, 2], dtype=np.int64) + tm.assert_numpy_array_equal(rindexer, exp) + + result, lindexer, rindexer = indexer(left, empty) + tm.assert_numpy_array_equal(result, left) + exp = np.array([0, 1, 2], dtype=np.int64) + tm.assert_numpy_array_equal(lindexer, exp) + exp = np.array([-1, -1, -1], dtype=np.int64) + tm.assert_numpy_array_equal(rindexer, exp) + + +def test_left_join_indexer_unique(): + a = np.array([1, 2, 3, 4, 5], dtype=np.int64) + b = np.array([2, 2, 3, 4, 4], dtype=np.int64) + + result = _join.left_join_indexer_unique_int64(b, a) + expected = np.array([1, 1, 2, 3, 3], dtype=np.int64) + assert (np.array_equal(result, expected)) + + +def test_left_outer_join_bug(): + left = np.array([0, 1, 0, 1, 1, 2, 3, 1, 0, 2, 1, 2, 0, 1, 1, 2, 3, 2, 3, + 2, 1, 1, 3, 0, 3, 2, 3, 0, 0, 2, 3, 2, 0, 3, 1, 3, 0, 1, + 3, 0, 0, 1, 0, 3, 1, 0, 1, 0, 1, 1, 0, 2, 2, 2, 2, 2, 0, + 3, 1, 2, 0, 0, 3, 1, 3, 2, 2, 0, 1, 3, 0, 2, 3, 2, 3, 3, + 2, 3, 3, 1, 3, 2, 0, 0, 3, 1, 1, 1, 0, 2, 3, 3, 1, 2, 0, + 3, 1, 2, 0, 2], dtype=np.int64) + + right = np.array([3, 1], dtype=np.int64) + max_groups = 4 + + lidx, ridx = _join.left_outer_join(left, right, max_groups, sort=False) + + exp_lidx = np.arange(len(left)) + exp_ridx = -np.ones(len(left)) + exp_ridx[left == 1] = 1 + exp_ridx[left == 3] = 0 + + assert (np.array_equal(lidx, exp_lidx)) + assert (np.array_equal(ridx, exp_ridx)) + + +def test_inner_join_indexer(): + a = np.array([1, 2, 3, 4, 5], dtype=np.int64) + b = np.array([0, 3, 5, 7, 9], dtype=np.int64) + + index, ares, bres = _join.inner_join_indexer_int64(a, b) + + index_exp = np.array([3, 5], dtype=np.int64) + assert_almost_equal(index, index_exp) + + aexp = np.array([2, 4], dtype=np.int64) + bexp = np.array([1, 2], dtype=np.int64) + assert_almost_equal(ares, aexp) + assert_almost_equal(bres, bexp) + + a = np.array([5], dtype=np.int64) + b = np.array([5], dtype=np.int64) + + index, ares, bres = 
_join.inner_join_indexer_int64(a, b) + tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64)) + tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.int64)) + tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.int64)) + + +def test_outer_join_indexer(): + a = np.array([1, 2, 3, 4, 5], dtype=np.int64) + b = np.array([0, 3, 5, 7, 9], dtype=np.int64) + + index, ares, bres = _join.outer_join_indexer_int64(a, b) + + index_exp = np.array([0, 1, 2, 3, 4, 5, 7, 9], dtype=np.int64) + assert_almost_equal(index, index_exp) + + aexp = np.array([-1, 0, 1, 2, 3, 4, -1, -1], dtype=np.int64) + bexp = np.array([0, -1, -1, 1, -1, 2, 3, 4], dtype=np.int64) + assert_almost_equal(ares, aexp) + assert_almost_equal(bres, bexp) + + a = np.array([5], dtype=np.int64) + b = np.array([5], dtype=np.int64) + + index, ares, bres = _join.outer_join_indexer_int64(a, b) + tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64)) + tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.int64)) + tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.int64)) + + +def test_left_join_indexer(): + a = np.array([1, 2, 3, 4, 5], dtype=np.int64) + b = np.array([0, 3, 5, 7, 9], dtype=np.int64) + + index, ares, bres = _join.left_join_indexer_int64(a, b) + + assert_almost_equal(index, a) + + aexp = np.array([0, 1, 2, 3, 4], dtype=np.int64) + bexp = np.array([-1, -1, 1, -1, 2], dtype=np.int64) + assert_almost_equal(ares, aexp) + assert_almost_equal(bres, bexp) + + a = np.array([5], dtype=np.int64) + b = np.array([5], dtype=np.int64) + + index, ares, bres = _join.left_join_indexer_int64(a, b) + tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64)) + tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.int64)) + tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.int64)) + + +def test_left_join_indexer2(): + idx = Index([1, 1, 2, 5]) + idx2 = Index([1, 2, 5, 7, 9]) + + res, lidx, ridx = _join.left_join_indexer_int64(idx2.values, idx.values) + + exp_res = np.array([1, 1, 2, 5, 7, 9], dtype=np.int64) + assert_almost_equal(res, exp_res) + + exp_lidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.int64) + assert_almost_equal(lidx, exp_lidx) + + exp_ridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.int64) + assert_almost_equal(ridx, exp_ridx) + + +def test_outer_join_indexer2(): + idx = Index([1, 1, 2, 5]) + idx2 = Index([1, 2, 5, 7, 9]) + + res, lidx, ridx = _join.outer_join_indexer_int64(idx2.values, idx.values) + + exp_res = np.array([1, 1, 2, 5, 7, 9], dtype=np.int64) + assert_almost_equal(res, exp_res) + + exp_lidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.int64) + assert_almost_equal(lidx, exp_lidx) + + exp_ridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.int64) + assert_almost_equal(ridx, exp_ridx) + + +def test_inner_join_indexer2(): + idx = Index([1, 1, 2, 5]) + idx2 = Index([1, 2, 5, 7, 9]) + + res, lidx, ridx = _join.inner_join_indexer_int64(idx2.values, idx.values) + + exp_res = np.array([1, 1, 2, 5], dtype=np.int64) + assert_almost_equal(res, exp_res) + + exp_lidx = np.array([0, 0, 1, 2], dtype=np.int64) + assert_almost_equal(lidx, exp_lidx) + + exp_ridx = np.array([0, 1, 2, 3], dtype=np.int64) + assert_almost_equal(ridx, exp_ridx) + + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/test_lib.py b/pandas/tests/test_lib.py index 10a6bb5c75b01..945f8004687cd 100644 --- a/pandas/tests/test_lib.py +++ b/pandas/tests/test_lib.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- import numpy as np 
+import pandas as pd import pandas.lib as lib import pandas.util.testing as tm @@ -184,43 +185,54 @@ def test_get_reverse_indexer(self): self.assertTrue(np.array_equal(result, expected)) -def test_duplicated_with_nas(): - keys = np.array([0, 1, np.nan, 0, 2, np.nan], dtype=object) +class TestNullObj(tm.TestCase): - result = lib.duplicated(keys) - expected = [False, False, False, True, False, True] - assert (np.array_equal(result, expected)) + _1d_methods = ['isnullobj', 'isnullobj_old'] + _2d_methods = ['isnullobj2d', 'isnullobj2d_old'] - result = lib.duplicated(keys, keep='first') - expected = [False, False, False, True, False, True] - assert (np.array_equal(result, expected)) + def _check_behavior(self, arr, expected): + for method in TestNullObj._1d_methods: + result = getattr(lib, method)(arr) + tm.assert_numpy_array_equal(result, expected) - result = lib.duplicated(keys, keep='last') - expected = [True, False, True, False, False, False] - assert (np.array_equal(result, expected)) + arr = np.atleast_2d(arr) + expected = np.atleast_2d(expected) - result = lib.duplicated(keys, keep=False) - expected = [True, False, True, True, False, True] - assert (np.array_equal(result, expected)) + for method in TestNullObj._2d_methods: + result = getattr(lib, method)(arr) + tm.assert_numpy_array_equal(result, expected) - keys = np.empty(8, dtype=object) - for i, t in enumerate(zip([0, 0, np.nan, np.nan] * 2, - [0, np.nan, 0, np.nan] * 2)): - keys[i] = t + def test_basic(self): + arr = np.array([1, None, 'foo', -5.1, pd.NaT, np.nan]) + expected = np.array([False, True, False, False, True, True]) - result = lib.duplicated(keys) - falses = [False] * 4 - trues = [True] * 4 - expected = falses + trues - assert (np.array_equal(result, expected)) + self._check_behavior(arr, expected) - result = lib.duplicated(keys, keep='last') - expected = trues + falses - assert (np.array_equal(result, expected)) + def test_non_obj_dtype(self): + arr = np.array([1, 3, np.nan, 5], dtype=float) + expected = np.array([False, False, True, False]) + + self._check_behavior(arr, expected) + + def test_empty_arr(self): + arr = np.array([]) + expected = np.array([], dtype=bool) + + self._check_behavior(arr, expected) + + def test_empty_str_inp(self): + arr = np.array([""]) # empty but not null + expected = np.array([False]) + + self._check_behavior(arr, expected) + + def test_empty_like(self): + # see gh-13717: no segfaults! 
+ arr = np.empty_like([None]) + expected = np.array([True]) + + self._check_behavior(arr, expected) - result = lib.duplicated(keys, keep=False) - expected = trues + trues - assert (np.array_equal(result, expected)) if __name__ == '__main__': import nose diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index c4ccef13f2844..f3b0becccf596 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -10,6 +10,7 @@ from pandas.core.index import Index, MultiIndex from pandas import Panel, DataFrame, Series, notnull, isnull, Timestamp +from pandas.types.common import is_float_dtype, is_integer_dtype from pandas.util.testing import (assert_almost_equal, assert_series_equal, assert_frame_equal, assertRaisesRegexp) import pandas.core.common as com @@ -787,8 +788,8 @@ def test_delevel_infer_dtype(self): df = DataFrame(np.random.randn(8, 3), columns=['A', 'B', 'C'], index=index) deleveled = df.reset_index() - self.assertTrue(com.is_integer_dtype(deleveled['prm1'])) - self.assertTrue(com.is_float_dtype(deleveled['prm2'])) + self.assertTrue(is_integer_dtype(deleveled['prm1'])) + self.assertTrue(is_float_dtype(deleveled['prm2'])) def test_reset_index_with_drop(self): deleveled = self.ymd.reset_index(drop=True) @@ -2365,7 +2366,7 @@ def test_reset_index_datetime(self): 'a': np.arange(6, dtype='int64')}, columns=['level_0', 'level_1', 'a']) expected['level_1'] = expected['level_1'].apply( - lambda d: pd.Timestamp(d, offset='D', tz=tz)) + lambda d: pd.Timestamp(d, freq='D', tz=tz)) assert_frame_equal(df.reset_index(), expected) def test_reset_index_period(self): diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index 904bedde03312..dd3a49de55d73 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -5,8 +5,8 @@ import warnings import numpy as np -from pandas import Series -from pandas.core.common import isnull, is_integer_dtype +from pandas import Series, isnull +from pandas.types.common import is_integer_dtype import pandas.core.nanops as nanops import pandas.util.testing as tm @@ -58,12 +58,14 @@ def setUp(self): 'O'), self.arr_utf.astype('O'), self.arr_date.astype('O'), self.arr_tdelta.astype('O')]) - self.arr_nan_nanj = self.arr_nan + self.arr_nan * 1j - self.arr_complex_nan = np.vstack([self.arr_complex, self.arr_nan_nanj]) + with np.errstate(invalid='ignore'): + self.arr_nan_nanj = self.arr_nan + self.arr_nan * 1j + self.arr_complex_nan = np.vstack([self.arr_complex, + self.arr_nan_nanj]) - self.arr_nan_infj = self.arr_inf * 1j - self.arr_complex_nan_infj = np.vstack([self.arr_complex, - self.arr_nan_infj]) + self.arr_nan_infj = self.arr_inf * 1j + self.arr_complex_nan_infj = np.vstack([self.arr_complex, + self.arr_nan_infj]) self.arr_float_2d = self.arr_float[:, :, 0] self.arr_float1_2d = self.arr_float1[:, :, 0] diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index b1f09ad2685e3..a197037789fd2 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -5,51 +5,31 @@ import operator import nose -from functools import wraps import numpy as np import pandas as pd -from pandas import Series, DataFrame, Index, isnull, notnull, pivot, MultiIndex -from pandas.core.datetools import bday +from pandas.types.common import is_float_dtype +from pandas import (Series, DataFrame, Index, date_range, isnull, notnull, + pivot, MultiIndex) from pandas.core.nanops import nanall, nanany from pandas.core.panel import Panel from pandas.core.series import remove_na -import 
pandas.core.common as com + from pandas.formats.printing import pprint_thing from pandas import compat from pandas.compat import range, lrange, StringIO, OrderedDict, signature -from pandas import SparsePanel +from pandas.tseries.offsets import BDay, MonthEnd from pandas.util.testing import (assert_panel_equal, assert_frame_equal, assert_series_equal, assert_almost_equal, - assert_produces_warning, ensure_clean, - assertRaisesRegexp, makeCustomDataframe as - mkdf, makeMixedDataFrame) + ensure_clean, assertRaisesRegexp, + makeCustomDataframe as mkdf, + makeMixedDataFrame) import pandas.core.panel as panelm import pandas.util.testing as tm -def ignore_sparse_panel_future_warning(func): - """ - decorator to ignore FutureWarning if we have a SparsePanel - - can be removed when SparsePanel is fully removed - """ - - @wraps(func) - def wrapper(self, *args, **kwargs): - - if isinstance(self.panel, SparsePanel): - with assert_produces_warning(FutureWarning, - check_stacklevel=False): - return func(self, *args, **kwargs) - else: - return func(self, *args, **kwargs) - - return wrapper - - class PanelTests(object): panel = None @@ -77,7 +57,6 @@ class SafeForLongAndSparse(object): def test_repr(self): repr(self.panel) - @ignore_sparse_panel_future_warning def test_copy_names(self): for attr in ('major_axis', 'minor_axis'): getattr(self.panel, attr).name = None @@ -156,16 +135,6 @@ def alt(x): self._check_stat_op('sem', alt) - # def test_skew(self): - # from scipy.stats import skew - - # def alt(x): - # if len(x) < 3: - # return np.nan - # return skew(x, bias=False) - - # self._check_stat_op('skew', alt) - def _check_stat_op(self, name, alternative, obj=None, has_skipna=True): if obj is None: obj = self.panel @@ -270,7 +239,6 @@ def test_get_plane_axes(self): index, columns = self.panel._get_plane_axes('minor_axis') index, columns = self.panel._get_plane_axes(0) - @ignore_sparse_panel_future_warning def test_truncate(self): dates = self.panel.major_axis start, end = dates[1], dates[5] @@ -331,7 +299,6 @@ def test_iteritems(self): self.assertEqual(len(list(self.panel.iteritems())), len(self.panel.items)) - @ignore_sparse_panel_future_warning def test_combineFrame(self): def check_op(op, name): # items @@ -361,18 +328,9 @@ def check_op(op, name): assert_frame_equal(result.minor_xs(idx), op(self.panel.minor_xs(idx), xs)) - ops = ['add', 'sub', 'mul', 'truediv', 'floordiv'] + ops = ['add', 'sub', 'mul', 'truediv', 'floordiv', 'pow', 'mod'] if not compat.PY3: ops.append('div') - # pow, mod not supported for SparsePanel as flex ops (for now) - if not isinstance(self.panel, SparsePanel): - ops.extend(['pow', 'mod']) - else: - idx = self.panel.minor_axis[1] - with assertRaisesRegexp(ValueError, "Simple arithmetic.*scalar"): - self.panel.pow(self.panel.minor_xs(idx), axis='minor') - with assertRaisesRegexp(ValueError, "Simple arithmetic.*scalar"): - self.panel.mod(self.panel.minor_xs(idx), axis='minor') for op in ops: try: @@ -387,12 +345,10 @@ def check_op(op, name): pprint_thing("Failing operation: %r" % 'div') raise - @ignore_sparse_panel_future_warning def test_combinePanel(self): result = self.panel.add(self.panel) self.assert_panel_equal(result, self.panel * 2) - @ignore_sparse_panel_future_warning def test_neg(self): self.assert_panel_equal(-self.panel, self.panel * -1) @@ -408,7 +364,6 @@ def test_raise_when_not_implemented(self): with self.assertRaises(NotImplementedError): getattr(p, op)(d, axis=0) - @ignore_sparse_panel_future_warning def test_select(self): p = self.panel @@ -440,7 +395,6 @@ def 
test_get_value(self): expected = self.panel[item][mnr][mjr] assert_almost_equal(result, expected) - @ignore_sparse_panel_future_warning def test_abs(self): result = self.panel.abs() @@ -547,11 +501,9 @@ def test_setitem(self): p[0] = np.random.randn(4, 2) def test_setitem_ndarray(self): - from pandas import date_range, datetools - timeidx = date_range(start=datetime(2009, 1, 1), end=datetime(2009, 12, 31), - freq=datetools.MonthEnd()) + freq=MonthEnd()) lons_coarse = np.linspace(-177.5, 177.5, 72) lats_coarse = np.linspace(-87.5, 87.5, 36) P = Panel(items=timeidx, major_axis=lons_coarse, @@ -589,7 +541,7 @@ def test_major_xs(self): self.assertEqual(result.name, 'ItemA') # not contained - idx = self.panel.major_axis[0] - bday + idx = self.panel.major_axis[0] - BDay() self.assertRaises(Exception, self.panel.major_xs, idx) def test_major_xs_mixed(self): @@ -871,12 +823,13 @@ def test_comp(func): self.assert_numpy_array_equal(result3.values, func(self.panel.values, 0)) - test_comp(operator.eq) - test_comp(operator.ne) - test_comp(operator.lt) - test_comp(operator.gt) - test_comp(operator.ge) - test_comp(operator.le) + with np.errstate(invalid='ignore'): + test_comp(operator.eq) + test_comp(operator.ne) + test_comp(operator.lt) + test_comp(operator.gt) + test_comp(operator.ge) + test_comp(operator.le) def test_get_value(self): for item in self.panel.items: @@ -903,7 +856,7 @@ def test_set_value(self): self.assertEqual(res.get_value('ItemE', 'foo', 'bar'), 1.5) res3 = self.panel.set_value('ItemE', 'foobar', 'baz', 5) - self.assertTrue(com.is_float_dtype(res3['ItemE'].values)) + self.assertTrue(is_float_dtype(res3['ItemE'].values)) with tm.assertRaisesRegexp(TypeError, "There must be an argument for each axis" " plus the value provided"): @@ -928,20 +881,6 @@ def setUp(self): self.panel.minor_axis.name = None self.panel.items.name = None - def test_panel_warnings(self): - with tm.assert_produces_warning(FutureWarning): - shifted1 = self.panel.shift(lags=1) - - with tm.assert_produces_warning(False): - shifted2 = self.panel.shift(periods=1) - - tm.assert_panel_equal(shifted1, shifted2) - - with tm.assert_produces_warning(False): - shifted3 = self.panel.shift() - - tm.assert_panel_equal(shifted1, shifted3) - def test_constructor(self): # with BlockManager wp = Panel(self.panel._data) @@ -1230,13 +1169,26 @@ def test_dtypes(self): expected = Series(np.dtype('float64'), index=self.panel.items) assert_series_equal(result, expected) + def test_astype(self): + # GH7271 + data = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]]) + panel = Panel(data, ['a', 'b'], ['c', 'd'], ['e', 'f']) + + str_data = np.array([[['1', '2'], ['3', '4']], + [['5', '6'], ['7', '8']]]) + expected = Panel(str_data, ['a', 'b'], ['c', 'd'], ['e', 'f']) + assert_panel_equal(panel.astype(str), expected) + + self.assertRaises(NotImplementedError, panel.astype, {0: str}) + def test_apply(self): # GH1148 # ufunc applied = self.panel.apply(np.sqrt) - self.assertTrue(assert_almost_equal(applied.values, np.sqrt( - self.panel.values))) + with np.errstate(invalid='ignore'): + expected = np.sqrt(self.panel.values) + assert_almost_equal(applied.values, expected) # ufunc same shape result = self.panel.apply(lambda x: x * 2, axis='items') @@ -1651,7 +1603,6 @@ def test_transpose_copy(self): panel.values[0, 1, 1] = np.nan self.assertTrue(notnull(result.values[1, 0, 1])) - @ignore_sparse_panel_future_warning def test_to_frame(self): # filtered filtered = self.panel.to_frame() @@ -1926,7 +1877,7 @@ def test_tshift(self): shifted2 = 
ps.tshift(freq='B') assert_panel_equal(shifted, shifted2) - shifted3 = ps.tshift(freq=bday) + shifted3 = ps.tshift(freq=BDay()) assert_panel_equal(shifted, shifted3) assertRaisesRegexp(ValueError, 'does not match', ps.tshift, freq='M') @@ -2429,7 +2380,12 @@ def test_to_string(self): buf = StringIO() self.panel.to_string(buf) - @ignore_sparse_panel_future_warning + def test_to_sparse(self): + if isinstance(self.panel, Panel): + msg = 'sparsifying is not supported' + tm.assertRaisesRegexp(NotImplementedError, msg, + self.panel.to_sparse) + def test_truncate(self): dates = self.panel.index.levels[0] start, end = dates[1], dates[5] @@ -2472,18 +2428,18 @@ def test_truncate(self): def test_axis_dummies(self): from pandas.core.reshape import make_axis_dummies - minor_dummies = make_axis_dummies(self.panel, 'minor') + minor_dummies = make_axis_dummies(self.panel, 'minor').astype(np.uint8) self.assertEqual(len(minor_dummies.columns), len(self.panel.index.levels[1])) - major_dummies = make_axis_dummies(self.panel, 'major') + major_dummies = make_axis_dummies(self.panel, 'major').astype(np.uint8) self.assertEqual(len(major_dummies.columns), len(self.panel.index.levels[0])) mapping = {'A': 'one', 'B': 'one', 'C': 'two', 'D': 'two'} transformed = make_axis_dummies(self.panel, 'minor', - transform=mapping.get) + transform=mapping.get).astype(np.uint8) self.assertEqual(len(transformed.columns), 2) self.assert_index_equal(transformed.columns, Index(['one', 'two'])) @@ -2493,7 +2449,7 @@ def test_get_dummies(self): from pandas.core.reshape import get_dummies, make_axis_dummies self.panel['Label'] = self.panel.index.labels[1] - minor_dummies = make_axis_dummies(self.panel, 'minor') + minor_dummies = make_axis_dummies(self.panel, 'minor').astype(np.uint8) dummies = get_dummies(self.panel['Label']) self.assert_numpy_array_equal(dummies.values, minor_dummies.values) @@ -2584,13 +2540,6 @@ def test_panel_index(): tm.assert_index_equal(index, expected) -def test_import_warnings(): - # GH8152 - panel = Panel(np.random.rand(3, 3, 3)) - with assert_produces_warning(): - panel.major_xs(1, copy=False) - - if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/tests/test_panel4d.py b/pandas/tests/test_panel4d.py index 607048df29faa..1b5a7b6ee1e83 100644 --- a/pandas/tests/test_panel4d.py +++ b/pandas/tests/test_panel4d.py @@ -6,12 +6,12 @@ import numpy as np +from pandas.types.common import is_float_dtype from pandas import Series, Index, isnull, notnull -from pandas.core.datetools import bday from pandas.core.panel import Panel from pandas.core.panel4d import Panel4D from pandas.core.series import remove_na -import pandas.core.common as com +from pandas.tseries.offsets import BDay from pandas.util.testing import (assert_panel_equal, assert_panel4d_equal, @@ -205,20 +205,22 @@ def test_get_axis_name(self): self.assertEqual(self.panel4d._get_axis_name(3), 'minor_axis') def test_arith(self): - self._test_op(self.panel4d, operator.add) - self._test_op(self.panel4d, operator.sub) - self._test_op(self.panel4d, operator.mul) - self._test_op(self.panel4d, operator.truediv) - self._test_op(self.panel4d, operator.floordiv) - self._test_op(self.panel4d, operator.pow) - - self._test_op(self.panel4d, lambda x, y: y + x) - self._test_op(self.panel4d, lambda x, y: y - x) - self._test_op(self.panel4d, lambda x, y: y * x) - self._test_op(self.panel4d, lambda x, y: y / x) - self._test_op(self.panel4d, lambda x, y: y ** x) - - self.assertRaises(Exception, 
self.panel4d.__add__, self.panel4d['l1']) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + self._test_op(self.panel4d, operator.add) + self._test_op(self.panel4d, operator.sub) + self._test_op(self.panel4d, operator.mul) + self._test_op(self.panel4d, operator.truediv) + self._test_op(self.panel4d, operator.floordiv) + self._test_op(self.panel4d, operator.pow) + + self._test_op(self.panel4d, lambda x, y: y + x) + self._test_op(self.panel4d, lambda x, y: y - x) + self._test_op(self.panel4d, lambda x, y: y * x) + self._test_op(self.panel4d, lambda x, y: y / x) + self._test_op(self.panel4d, lambda x, y: y ** x) + + self.assertRaises(Exception, self.panel4d.__add__, + self.panel4d['l1']) @staticmethod def _test_op(panel4d, op): @@ -235,41 +237,47 @@ def test_iteritems(self): len(self.panel4d.labels)) def test_combinePanel4d(self): - result = self.panel4d.add(self.panel4d) - self.assert_panel4d_equal(result, self.panel4d * 2) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = self.panel4d.add(self.panel4d) + self.assert_panel4d_equal(result, self.panel4d * 2) def test_neg(self): - self.assert_panel4d_equal(-self.panel4d, self.panel4d * -1) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + self.assert_panel4d_equal(-self.panel4d, self.panel4d * -1) def test_select(self): - p = self.panel4d + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + + p = self.panel4d - # select labels - result = p.select(lambda x: x in ('l1', 'l3'), axis='labels') - expected = p.reindex(labels=['l1', 'l3']) - self.assert_panel4d_equal(result, expected) + # select labels + result = p.select(lambda x: x in ('l1', 'l3'), axis='labels') + expected = p.reindex(labels=['l1', 'l3']) + self.assert_panel4d_equal(result, expected) - # select items - result = p.select(lambda x: x in ('ItemA', 'ItemC'), axis='items') - expected = p.reindex(items=['ItemA', 'ItemC']) - self.assert_panel4d_equal(result, expected) + # select items + result = p.select(lambda x: x in ('ItemA', 'ItemC'), axis='items') + expected = p.reindex(items=['ItemA', 'ItemC']) + self.assert_panel4d_equal(result, expected) - # select major_axis - result = p.select(lambda x: x >= datetime(2000, 1, 15), axis='major') - new_major = p.major_axis[p.major_axis >= datetime(2000, 1, 15)] - expected = p.reindex(major=new_major) - self.assert_panel4d_equal(result, expected) + # select major_axis + result = p.select(lambda x: x >= datetime(2000, 1, 15), + axis='major') + new_major = p.major_axis[p.major_axis >= datetime(2000, 1, 15)] + expected = p.reindex(major=new_major) + self.assert_panel4d_equal(result, expected) - # select minor_axis - result = p.select(lambda x: x in ('D', 'A'), axis=3) - expected = p.reindex(minor=['A', 'D']) - self.assert_panel4d_equal(result, expected) + # select minor_axis + result = p.select(lambda x: x in ('D', 'A'), axis=3) + expected = p.reindex(minor=['A', 'D']) + self.assert_panel4d_equal(result, expected) - # corner case, empty thing - result = p.select(lambda x: x in ('foo',), axis='items') - self.assert_panel4d_equal(result, p.reindex(items=[])) + # corner case, empty thing + result = p.select(lambda x: x in ('foo',), axis='items') + self.assert_panel4d_equal(result, p.reindex(items=[])) def test_get_value(self): + for item in self.panel.items: for mjr in self.panel.major_axis[::2]: for mnr in self.panel.minor_axis: @@ -278,19 +286,21 @@ def test_get_value(self): assert_almost_equal(result, expected) def test_abs(self): - result = 
self.panel4d.abs() - expected = np.abs(self.panel4d) - self.assert_panel4d_equal(result, expected) - p = self.panel4d['l1'] - result = p.abs() - expected = np.abs(p) - assert_panel_equal(result, expected) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = self.panel4d.abs() + expected = np.abs(self.panel4d) + self.assert_panel4d_equal(result, expected) + + p = self.panel4d['l1'] + result = p.abs() + expected = np.abs(p) + assert_panel_equal(result, expected) - df = p['ItemA'] - result = df.abs() - expected = np.abs(df) - assert_frame_equal(result, expected) + df = p['ItemA'] + result = df.abs() + expected = np.abs(df) + assert_frame_equal(result, expected) class CheckIndexing(object): @@ -301,48 +311,50 @@ def test_getitem(self): self.assertRaises(Exception, self.panel4d.__getitem__, 'ItemQ') def test_delitem_and_pop(self): - expected = self.panel4d['l2'] - result = self.panel4d.pop('l2') - assert_panel_equal(expected, result) - self.assertNotIn('l2', self.panel4d.labels) - - del self.panel4d['l3'] - self.assertNotIn('l3', self.panel4d.labels) - self.assertRaises(Exception, self.panel4d.__delitem__, 'l3') - - values = np.empty((4, 4, 4, 4)) - values[0] = 0 - values[1] = 1 - values[2] = 2 - values[3] = 3 - - panel4d = Panel4D(values, lrange(4), lrange(4), lrange(4), lrange(4)) - - # did we delete the right row? - - panel4dc = panel4d.copy() - del panel4dc[0] - assert_panel_equal(panel4dc[1], panel4d[1]) - assert_panel_equal(panel4dc[2], panel4d[2]) - assert_panel_equal(panel4dc[3], panel4d[3]) - - panel4dc = panel4d.copy() - del panel4dc[1] - assert_panel_equal(panel4dc[0], panel4d[0]) - assert_panel_equal(panel4dc[2], panel4d[2]) - assert_panel_equal(panel4dc[3], panel4d[3]) - - panel4dc = panel4d.copy() - del panel4dc[2] - assert_panel_equal(panel4dc[1], panel4d[1]) - assert_panel_equal(panel4dc[0], panel4d[0]) - assert_panel_equal(panel4dc[3], panel4d[3]) - - panel4dc = panel4d.copy() - del panel4dc[3] - assert_panel_equal(panel4dc[1], panel4d[1]) - assert_panel_equal(panel4dc[2], panel4d[2]) - assert_panel_equal(panel4dc[0], panel4d[0]) + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + expected = self.panel4d['l2'] + result = self.panel4d.pop('l2') + assert_panel_equal(expected, result) + self.assertNotIn('l2', self.panel4d.labels) + + del self.panel4d['l3'] + self.assertNotIn('l3', self.panel4d.labels) + self.assertRaises(Exception, self.panel4d.__delitem__, 'l3') + + values = np.empty((4, 4, 4, 4)) + values[0] = 0 + values[1] = 1 + values[2] = 2 + values[3] = 3 + + panel4d = Panel4D(values, lrange(4), lrange(4), + lrange(4), lrange(4)) + + # did we delete the right row? 
+ panel4dc = panel4d.copy() + del panel4dc[0] + assert_panel_equal(panel4dc[1], panel4d[1]) + assert_panel_equal(panel4dc[2], panel4d[2]) + assert_panel_equal(panel4dc[3], panel4d[3]) + + panel4dc = panel4d.copy() + del panel4dc[1] + assert_panel_equal(panel4dc[0], panel4d[0]) + assert_panel_equal(panel4dc[2], panel4d[2]) + assert_panel_equal(panel4dc[3], panel4d[3]) + + panel4dc = panel4d.copy() + del panel4dc[2] + assert_panel_equal(panel4dc[1], panel4d[1]) + assert_panel_equal(panel4dc[0], panel4d[0]) + assert_panel_equal(panel4dc[3], panel4d[3]) + + panel4dc = panel4d.copy() + del panel4dc[3] + assert_panel_equal(panel4dc[1], panel4d[1]) + assert_panel_equal(panel4dc[2], panel4d[2]) + assert_panel_equal(panel4dc[0], panel4d[0]) def test_setitem(self): # LongPanel with one item @@ -378,95 +390,84 @@ def test_setitem(self): def test_setitem_by_indexer(self): - # Panel - panel4dc = self.panel4d.copy() - p = panel4dc.iloc[0] - - def func(): - self.panel4d.iloc[0] = p - self.assertRaises(NotImplementedError, func) - - # DataFrame - panel4dc = self.panel4d.copy() - df = panel4dc.iloc[0, 0] - df.iloc[:] = 1 - panel4dc.iloc[0, 0] = df - self.assertTrue((panel4dc.iloc[0, 0].values == 1).all()) - - # Series - panel4dc = self.panel4d.copy() - s = panel4dc.iloc[0, 0, :, 0] - s.iloc[:] = 1 - panel4dc.iloc[0, 0, :, 0] = s - self.assertTrue((panel4dc.iloc[0, 0, :, 0].values == 1).all()) - - # scalar - panel4dc = self.panel4d.copy() - panel4dc.iloc[0] = 1 - panel4dc.iloc[1] = True - panel4dc.iloc[2] = 'foo' - self.assertTrue((panel4dc.iloc[0].values == 1).all()) - self.assertTrue(panel4dc.iloc[1].values.all()) - self.assertTrue((panel4dc.iloc[2].values == 'foo').all()) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + + # Panel + panel4dc = self.panel4d.copy() + p = panel4dc.iloc[0] + + def func(): + self.panel4d.iloc[0] = p + self.assertRaises(NotImplementedError, func) + + # DataFrame + panel4dc = self.panel4d.copy() + df = panel4dc.iloc[0, 0] + df.iloc[:] = 1 + panel4dc.iloc[0, 0] = df + self.assertTrue((panel4dc.iloc[0, 0].values == 1).all()) + + # Series + panel4dc = self.panel4d.copy() + s = panel4dc.iloc[0, 0, :, 0] + s.iloc[:] = 1 + panel4dc.iloc[0, 0, :, 0] = s + self.assertTrue((panel4dc.iloc[0, 0, :, 0].values == 1).all()) + + # scalar + panel4dc = self.panel4d.copy() + panel4dc.iloc[0] = 1 + panel4dc.iloc[1] = True + panel4dc.iloc[2] = 'foo' + self.assertTrue((panel4dc.iloc[0].values == 1).all()) + self.assertTrue(panel4dc.iloc[1].values.all()) + self.assertTrue((panel4dc.iloc[2].values == 'foo').all()) def test_setitem_by_indexer_mixed_type(self): - # GH 8702 - self.panel4d['foo'] = 'bar' - # scalar - panel4dc = self.panel4d.copy() - panel4dc.iloc[0] = 1 - panel4dc.iloc[1] = True - panel4dc.iloc[2] = 'foo' - self.assertTrue((panel4dc.iloc[0].values == 1).all()) - self.assertTrue(panel4dc.iloc[1].values.all()) - self.assertTrue((panel4dc.iloc[2].values == 'foo').all()) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + # GH 8702 + self.panel4d['foo'] = 'bar' + + # scalar + panel4dc = self.panel4d.copy() + panel4dc.iloc[0] = 1 + panel4dc.iloc[1] = True + panel4dc.iloc[2] = 'foo' + self.assertTrue((panel4dc.iloc[0].values == 1).all()) + self.assertTrue(panel4dc.iloc[1].values.all()) + self.assertTrue((panel4dc.iloc[2].values == 'foo').all()) def test_comparisons(self): - p1 = tm.makePanel4D() - p2 = tm.makePanel4D() - - tp = p1.reindex(labels=p1.labels.tolist() + ['foo']) - p = p1[p1.labels[0]] - - def test_comp(func): - result = func(p1, 
p2) - self.assert_numpy_array_equal(result.values, - func(p1.values, p2.values)) - - # versus non-indexed same objs - self.assertRaises(Exception, func, p1, tp) - - # versus different objs - self.assertRaises(Exception, func, p1, p) - - result3 = func(self.panel4d, 0) - self.assert_numpy_array_equal(result3.values, - func(self.panel4d.values, 0)) - - test_comp(operator.eq) - test_comp(operator.ne) - test_comp(operator.lt) - test_comp(operator.gt) - test_comp(operator.ge) - test_comp(operator.le) - - def test_setitem_ndarray(self): - raise nose.SkipTest("skipping for now") - # from pandas import DateRange, datetools - - # timeidx = DateRange(start=datetime(2009,1,1), - # end=datetime(2009,12,31), - # offset=datetools.MonthEnd()) - # lons_coarse = np.linspace(-177.5, 177.5, 72) - # lats_coarse = np.linspace(-87.5, 87.5, 36) - # P = Panel(items=timeidx, major_axis=lons_coarse, - # minor_axis=lats_coarse) - # data = np.random.randn(72*36).reshape((72,36)) - # key = datetime(2009,2,28) - # P[key] = data# - - # assert_almost_equal(P[key].values, data) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + p1 = tm.makePanel4D() + p2 = tm.makePanel4D() + + tp = p1.reindex(labels=p1.labels.tolist() + ['foo']) + p = p1[p1.labels[0]] + + def test_comp(func): + result = func(p1, p2) + self.assert_numpy_array_equal(result.values, + func(p1.values, p2.values)) + + # versus non-indexed same objs + self.assertRaises(Exception, func, p1, tp) + + # versus different objs + self.assertRaises(Exception, func, p1, p) + + result3 = func(self.panel4d, 0) + self.assert_numpy_array_equal(result3.values, + func(self.panel4d.values, 0)) + + with np.errstate(invalid='ignore'): + test_comp(operator.eq) + test_comp(operator.ne) + test_comp(operator.lt) + test_comp(operator.gt) + test_comp(operator.ge) + test_comp(operator.le) def test_major_xs(self): ref = self.panel4d['l1']['ItemA'] @@ -478,7 +479,7 @@ def test_major_xs(self): ref.xs(idx), check_names=False) # not contained - idx = self.panel4d.major_axis[0] - bday + idx = self.panel4d.major_axis[0] - BDay() self.assertRaises(Exception, self.panel4d.major_xs, idx) def test_major_xs_mixed(self): @@ -521,42 +522,43 @@ def test_xs(self): self.assertIsNotNone(result.is_copy) def test_getitem_fancy_labels(self): - panel4d = self.panel4d + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + panel4d = self.panel4d - labels = panel4d.labels[[1, 0]] - items = panel4d.items[[1, 0]] - dates = panel4d.major_axis[::2] - cols = ['D', 'C', 'F'] + labels = panel4d.labels[[1, 0]] + items = panel4d.items[[1, 0]] + dates = panel4d.major_axis[::2] + cols = ['D', 'C', 'F'] - # all 4 specified - assert_panel4d_equal(panel4d.ix[labels, items, dates, cols], - panel4d.reindex(labels=labels, items=items, - major=dates, minor=cols)) + # all 4 specified + assert_panel4d_equal(panel4d.ix[labels, items, dates, cols], + panel4d.reindex(labels=labels, items=items, + major=dates, minor=cols)) - # 3 specified - assert_panel4d_equal(panel4d.ix[:, items, dates, cols], - panel4d.reindex(items=items, major=dates, - minor=cols)) + # 3 specified + assert_panel4d_equal(panel4d.ix[:, items, dates, cols], + panel4d.reindex(items=items, major=dates, + minor=cols)) - # 2 specified - assert_panel4d_equal(panel4d.ix[:, :, dates, cols], - panel4d.reindex(major=dates, minor=cols)) + # 2 specified + assert_panel4d_equal(panel4d.ix[:, :, dates, cols], + panel4d.reindex(major=dates, minor=cols)) - assert_panel4d_equal(panel4d.ix[:, items, :, cols], - 
panel4d.reindex(items=items, minor=cols)) + assert_panel4d_equal(panel4d.ix[:, items, :, cols], + panel4d.reindex(items=items, minor=cols)) - assert_panel4d_equal(panel4d.ix[:, items, dates, :], - panel4d.reindex(items=items, major=dates)) + assert_panel4d_equal(panel4d.ix[:, items, dates, :], + panel4d.reindex(items=items, major=dates)) - # only 1 - assert_panel4d_equal(panel4d.ix[:, items, :, :], - panel4d.reindex(items=items)) + # only 1 + assert_panel4d_equal(panel4d.ix[:, items, :, :], + panel4d.reindex(items=items)) - assert_panel4d_equal(panel4d.ix[:, :, dates, :], - panel4d.reindex(major=dates)) + assert_panel4d_equal(panel4d.ix[:, :, dates, :], + panel4d.reindex(major=dates)) - assert_panel4d_equal(panel4d.ix[:, :, :, cols], - panel4d.reindex(minor=cols)) + assert_panel4d_equal(panel4d.ix[:, :, :, cols], + panel4d.reindex(minor=cols)) def test_getitem_fancy_slice(self): pass @@ -564,11 +566,6 @@ def test_getitem_fancy_slice(self): def test_getitem_fancy_ints(self): pass - def test_getitem_fancy_xs(self): - raise nose.SkipTest("skipping for now") - # self.assertRaises(NotImplementedError, self.panel4d.major_xs) - # self.assertRaises(NotImplementedError, self.panel4d.minor_xs) - def test_get_value(self): for label in self.panel4d.labels: for item in self.panel4d.items: @@ -580,22 +577,28 @@ def test_get_value(self): assert_almost_equal(result, expected) def test_set_value(self): - for label in self.panel4d.labels: - for item in self.panel4d.items: - for mjr in self.panel4d.major_axis[::2]: - for mnr in self.panel4d.minor_axis: - self.panel4d.set_value(label, item, mjr, mnr, 1.) - assert_almost_equal( - self.panel4d[label][item][mnr][mjr], 1.) - # resize - res = self.panel4d.set_value('l4', 'ItemE', 'foo', 'bar', 1.5) - tm.assertIsInstance(res, Panel4D) - self.assertIsNot(res, self.panel4d) - self.assertEqual(res.get_value('l4', 'ItemE', 'foo', 'bar'), 1.5) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - res3 = self.panel4d.set_value('l4', 'ItemE', 'foobar', 'baz', 5) - self.assertTrue(com.is_float_dtype(res3['l4'].values)) + for label in self.panel4d.labels: + for item in self.panel4d.items: + for mjr in self.panel4d.major_axis[::2]: + for mnr in self.panel4d.minor_axis: + self.panel4d.set_value(label, item, mjr, mnr, 1.) + assert_almost_equal( + self.panel4d[label][item][mnr][mjr], 1.) 
+ + res3 = self.panel4d.set_value('l4', 'ItemE', 'foobar', 'baz', 5) + self.assertTrue(is_float_dtype(res3['l4'].values)) + + # resize + res = self.panel4d.set_value('l4', 'ItemE', 'foo', 'bar', 1.5) + tm.assertIsInstance(res, Panel4D) + self.assertIsNot(res, self.panel4d) + self.assertEqual(res.get_value('l4', 'ItemE', 'foo', 'bar'), 1.5) + + res3 = self.panel4d.set_value('l4', 'ItemE', 'foobar', 'baz', 5) + self.assertTrue(is_float_dtype(res3['l4'].values)) class TestPanel4d(tm.TestCase, CheckIndexing, SafeForSparse, @@ -608,194 +611,150 @@ def assert_panel4d_equal(cls, x, y): assert_panel4d_equal(x, y) def setUp(self): - self.panel4d = tm.makePanel4D(nper=8) - add_nans(self.panel4d) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + self.panel4d = tm.makePanel4D(nper=8) + add_nans(self.panel4d) def test_constructor(self): - # with BlockManager - panel4d = Panel4D(self.panel4d._data) - self.assertIs(panel4d._data, self.panel4d._data) - - panel4d = Panel4D(self.panel4d._data, copy=True) - self.assertIsNot(panel4d._data, self.panel4d._data) - assert_panel4d_equal(panel4d, self.panel4d) - - # strings handled prop - # panel4d = Panel4D([[['foo', 'foo', 'foo',], - # ['foo', 'foo', 'foo']]]) - # self.assertEqual(wp.values.dtype, np.object_) - - vals = self.panel4d.values - - # no copy - panel4d = Panel4D(vals) - self.assertIs(panel4d.values, vals) - - # copy - panel4d = Panel4D(vals, copy=True) - self.assertIsNot(panel4d.values, vals) - - # GH #8285, test when scalar data is used to construct a Panel4D - # if dtype is not passed, it should be inferred - value_and_dtype = [(1, 'int64'), (3.14, 'float64'), - ('foo', np.object_)] - for (val, dtype) in value_and_dtype: - panel4d = Panel4D(val, labels=range(2), items=range( - 3), major_axis=range(4), minor_axis=range(5)) - vals = np.empty((2, 3, 4, 5), dtype=dtype) - vals.fill(val) - assert_panel4d_equal(panel4d, Panel4D(vals, dtype=dtype)) - - # test the case when dtype is passed - panel4d = Panel4D(1, labels=range(2), items=range( - 3), major_axis=range(4), minor_axis=range(5), dtype='float32') - vals = np.empty((2, 3, 4, 5), dtype='float32') - vals.fill(1) - assert_panel4d_equal(panel4d, Panel4D(vals, dtype='float32')) - def test_constructor_cast(self): - zero_filled = self.panel4d.fillna(0) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + panel4d = Panel4D(self.panel4d._data) + self.assertIs(panel4d._data, self.panel4d._data) + + panel4d = Panel4D(self.panel4d._data, copy=True) + self.assertIsNot(panel4d._data, self.panel4d._data) + assert_panel4d_equal(panel4d, self.panel4d) + + vals = self.panel4d.values + + # no copy + panel4d = Panel4D(vals) + self.assertIs(panel4d.values, vals) + + # copy + panel4d = Panel4D(vals, copy=True) + self.assertIsNot(panel4d.values, vals) + + # GH #8285, test when scalar data is used to construct a Panel4D + # if dtype is not passed, it should be inferred + value_and_dtype = [(1, 'int64'), (3.14, 'float64'), + ('foo', np.object_)] + for (val, dtype) in value_and_dtype: + panel4d = Panel4D(val, labels=range(2), items=range( + 3), major_axis=range(4), minor_axis=range(5)) + vals = np.empty((2, 3, 4, 5), dtype=dtype) + vals.fill(val) + expected = Panel4D(vals, dtype=dtype) + assert_panel4d_equal(panel4d, expected) + + # test the case when dtype is passed + panel4d = Panel4D(1, labels=range(2), items=range( + 3), major_axis=range(4), minor_axis=range(5), dtype='float32') + vals = np.empty((2, 3, 4, 5), dtype='float32') + vals.fill(1) + + expected = Panel4D(vals, 
dtype='float32') + assert_panel4d_equal(panel4d, expected) - casted = Panel4D(zero_filled._data, dtype=int) - casted2 = Panel4D(zero_filled.values, dtype=int) - - exp_values = zero_filled.values.astype(int) - assert_almost_equal(casted.values, exp_values) - assert_almost_equal(casted2.values, exp_values) + def test_constructor_cast(self): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + zero_filled = self.panel4d.fillna(0) - casted = Panel4D(zero_filled._data, dtype=np.int32) - casted2 = Panel4D(zero_filled.values, dtype=np.int32) + casted = Panel4D(zero_filled._data, dtype=int) + casted2 = Panel4D(zero_filled.values, dtype=int) - exp_values = zero_filled.values.astype(np.int32) - assert_almost_equal(casted.values, exp_values) - assert_almost_equal(casted2.values, exp_values) + exp_values = zero_filled.values.astype(int) + assert_almost_equal(casted.values, exp_values) + assert_almost_equal(casted2.values, exp_values) - # can't cast - data = [[['foo', 'bar', 'baz']]] - self.assertRaises(ValueError, Panel, data, dtype=float) + casted = Panel4D(zero_filled._data, dtype=np.int32) + casted2 = Panel4D(zero_filled.values, dtype=np.int32) - def test_constructor_empty_panel(self): - empty = Panel() - self.assertEqual(len(empty.items), 0) - self.assertEqual(len(empty.major_axis), 0) - self.assertEqual(len(empty.minor_axis), 0) + exp_values = zero_filled.values.astype(np.int32) + assert_almost_equal(casted.values, exp_values) + assert_almost_equal(casted2.values, exp_values) - def test_constructor_observe_dtype(self): - # GH #411 - panel = Panel(items=lrange(3), major_axis=lrange(3), - minor_axis=lrange(3), dtype='O') - self.assertEqual(panel.values.dtype, np.object_) + # can't cast + data = [[['foo', 'bar', 'baz']]] + self.assertRaises(ValueError, Panel, data, dtype=float) def test_consolidate(self): - self.assertTrue(self.panel4d._data.is_consolidated()) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + self.assertTrue(self.panel4d._data.is_consolidated()) - self.panel4d['foo'] = 1. - self.assertFalse(self.panel4d._data.is_consolidated()) + self.panel4d['foo'] = 1. 
+ self.assertFalse(self.panel4d._data.is_consolidated()) - panel4d = self.panel4d.consolidate() - self.assertTrue(panel4d._data.is_consolidated()) + panel4d = self.panel4d.consolidate() + self.assertTrue(panel4d._data.is_consolidated()) def test_ctor_dict(self): - l1 = self.panel4d['l1'] - l2 = self.panel4d['l2'] - - d = {'A': l1, 'B': l2.ix[['ItemB'], :, :]} - # d2 = {'A' : itema._series, 'B' : itemb[5:]._series} - # d3 = {'A' : DataFrame(itema._series), - # 'B' : DataFrame(itemb[5:]._series)} - - panel4d = Panel4D(d) - # wp2 = Panel.from_dict(d2) # nested Dict - # wp3 = Panel.from_dict(d3) - # self.assertTrue(wp.major_axis.equals(self.panel.major_axis)) - assert_panel_equal(panel4d['A'], self.panel4d['l1']) - assert_frame_equal(panel4d.ix['B', 'ItemB', :, :], - self.panel4d.ix['l2', ['ItemB'], :, :]['ItemB']) - - # intersect - # wp = Panel.from_dict(d, intersect=True) - # self.assertTrue(wp.major_axis.equals(itemb.index[5:])) - - # use constructor - # assert_panel_equal(Panel(d), Panel.from_dict(d)) - # assert_panel_equal(Panel(d2), Panel.from_dict(d2)) - # assert_panel_equal(Panel(d3), Panel.from_dict(d3)) - - # cast - # dcasted = dict((k, v.reindex(wp.major_axis).fillna(0)) - # for k, v in d.iteritems()) - # result = Panel(dcasted, dtype=int) - # expected = Panel(dict((k, v.astype(int)) - # for k, v in dcasted.iteritems())) - # assert_panel_equal(result, expected) - - def test_constructor_dict_mixed(self): - data = dict((k, v.values) for k, v in self.panel4d.iteritems()) - result = Panel4D(data) - exp_major = Index(np.arange(len(self.panel4d.major_axis))) - self.assert_index_equal(result.major_axis, exp_major) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + l1 = self.panel4d['l1'] + l2 = self.panel4d['l2'] - result = Panel4D(data, - labels=self.panel4d.labels, - items=self.panel4d.items, - major_axis=self.panel4d.major_axis, - minor_axis=self.panel4d.minor_axis) - assert_panel4d_equal(result, self.panel4d) + d = {'A': l1, 'B': l2.ix[['ItemB'], :, :]} + panel4d = Panel4D(d) - data['l2'] = self.panel4d['l2'] - result = Panel4D(data) - assert_panel4d_equal(result, self.panel4d) - - # corner, blow up - data['l2'] = data['l2']['ItemB'] - self.assertRaises(Exception, Panel4D, data) - - data['l2'] = self.panel4d['l2'].values[:, :, :-1] - self.assertRaises(Exception, Panel4D, data) - - def test_constructor_resize(self): - data = self.panel4d._data - labels = self.panel4d.labels[:-1] - items = self.panel4d.items[:-1] - major = self.panel4d.major_axis[:-1] - minor = self.panel4d.minor_axis[:-1] + assert_panel_equal(panel4d['A'], self.panel4d['l1']) + assert_frame_equal(panel4d.ix['B', 'ItemB', :, :], + self.panel4d.ix['l2', ['ItemB'], :, :]['ItemB']) - result = Panel4D(data, labels=labels, items=items, - major_axis=major, minor_axis=minor) - expected = self.panel4d.reindex( - labels=labels, items=items, major=major, minor=minor) - assert_panel4d_equal(result, expected) - - result = Panel4D(data, items=items, major_axis=major) - expected = self.panel4d.reindex(items=items, major=major) - assert_panel4d_equal(result, expected) + def test_constructor_dict_mixed(self): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + data = dict((k, v.values) for k, v in self.panel4d.iteritems()) + result = Panel4D(data) - result = Panel4D(data, items=items) - expected = self.panel4d.reindex(items=items) - assert_panel4d_equal(result, expected) + exp_major = Index(np.arange(len(self.panel4d.major_axis))) + self.assert_index_equal(result.major_axis, exp_major) - 
result = Panel4D(data, minor_axis=minor) - expected = self.panel4d.reindex(minor=minor) - assert_panel4d_equal(result, expected) + result = Panel4D(data, + labels=self.panel4d.labels, + items=self.panel4d.items, + major_axis=self.panel4d.major_axis, + minor_axis=self.panel4d.minor_axis) + assert_panel4d_equal(result, self.panel4d) - def test_from_dict_mixed_orient(self): - raise nose.SkipTest("skipping for now") - # df = tm.makeDataFrame() - # df['foo'] = 'bar' + data['l2'] = self.panel4d['l2'] - # data = {'k1' : df, - # 'k2' : df} + result = Panel4D(data) + assert_panel4d_equal(result, self.panel4d) - # panel = Panel.from_dict(data, orient='minor') + # corner, blow up + data['l2'] = data['l2']['ItemB'] + self.assertRaises(Exception, Panel4D, data) - # self.assertEqual(panel['foo'].values.dtype, np.object_) - # self.assertEqual(panel['A'].values.dtype, np.float64) + data['l2'] = self.panel4d['l2'].values[:, :, :-1] + self.assertRaises(Exception, Panel4D, data) - def test_values(self): - self.assertRaises(Exception, Panel, np.random.randn(5, 5, 5), - lrange(5), lrange(5), lrange(4)) + def test_constructor_resize(self): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + data = self.panel4d._data + labels = self.panel4d.labels[:-1] + items = self.panel4d.items[:-1] + major = self.panel4d.major_axis[:-1] + minor = self.panel4d.minor_axis[:-1] + + result = Panel4D(data, labels=labels, items=items, + major_axis=major, minor_axis=minor) + expected = self.panel4d.reindex( + labels=labels, items=items, major=major, minor=minor) + assert_panel4d_equal(result, expected) + + result = Panel4D(data, items=items, major_axis=major) + expected = self.panel4d.reindex(items=items, major=major) + assert_panel4d_equal(result, expected) + + result = Panel4D(data, items=items) + expected = self.panel4d.reindex(items=items) + assert_panel4d_equal(result, expected) + + result = Panel4D(data, minor_axis=minor) + expected = self.panel4d.reindex(minor=minor) + assert_panel4d_equal(result, expected) def test_conform(self): + p = self.panel4d['l1'].filter(items=['ItemA', 'ItemB']) conformed = self.panel4d.conform(p) @@ -804,208 +763,155 @@ def test_conform(self): tm.assert_index_equal(conformed.minor_axis, self.panel4d.minor_axis) def test_reindex(self): - ref = self.panel4d['l2'] - - # labels - result = self.panel4d.reindex(labels=['l1', 'l2']) - assert_panel_equal(result['l2'], ref) - - # items - result = self.panel4d.reindex(items=['ItemA', 'ItemB']) - assert_frame_equal(result['l2']['ItemB'], ref['ItemB']) - - # major - new_major = list(self.panel4d.major_axis[:10]) - result = self.panel4d.reindex(major=new_major) - assert_frame_equal( - result['l2']['ItemB'], ref['ItemB'].reindex(index=new_major)) - - # raise exception put both major and major_axis - self.assertRaises(Exception, self.panel4d.reindex, - major_axis=new_major, major=new_major) - - # minor - new_minor = list(self.panel4d.minor_axis[:2]) - result = self.panel4d.reindex(minor=new_minor) - assert_frame_equal( - result['l2']['ItemB'], ref['ItemB'].reindex(columns=new_minor)) - - result = self.panel4d.reindex(labels=self.panel4d.labels, - items=self.panel4d.items, - major=self.panel4d.major_axis, - minor=self.panel4d.minor_axis) - - # don't necessarily copy - result = self.panel4d.reindex() - assert_panel4d_equal(result, self.panel4d) - self.assertFalse(result is self.panel4d) - - # with filling - smaller_major = self.panel4d.major_axis[::5] - smaller = self.panel4d.reindex(major=smaller_major) - - larger = 
smaller.reindex(major=self.panel4d.major_axis, - method='pad') - - assert_panel_equal(larger.ix[:, :, self.panel4d.major_axis[1], :], - smaller.ix[:, :, smaller_major[0], :]) - - # don't necessarily copy - result = self.panel4d.reindex( - major=self.panel4d.major_axis, copy=False) - assert_panel4d_equal(result, self.panel4d) - self.assertTrue(result is self.panel4d) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + ref = self.panel4d['l2'] + + # labels + result = self.panel4d.reindex(labels=['l1', 'l2']) + assert_panel_equal(result['l2'], ref) + + # items + result = self.panel4d.reindex(items=['ItemA', 'ItemB']) + assert_frame_equal(result['l2']['ItemB'], ref['ItemB']) + + # major + new_major = list(self.panel4d.major_axis[:10]) + result = self.panel4d.reindex(major=new_major) + assert_frame_equal( + result['l2']['ItemB'], ref['ItemB'].reindex(index=new_major)) + + # raise exception put both major and major_axis + self.assertRaises(Exception, self.panel4d.reindex, + major_axis=new_major, major=new_major) + + # minor + new_minor = list(self.panel4d.minor_axis[:2]) + result = self.panel4d.reindex(minor=new_minor) + assert_frame_equal( + result['l2']['ItemB'], ref['ItemB'].reindex(columns=new_minor)) + + result = self.panel4d.reindex(labels=self.panel4d.labels, + items=self.panel4d.items, + major=self.panel4d.major_axis, + minor=self.panel4d.minor_axis) + + # don't necessarily copy + result = self.panel4d.reindex() + assert_panel4d_equal(result, self.panel4d) + self.assertFalse(result is self.panel4d) + + # with filling + smaller_major = self.panel4d.major_axis[::5] + smaller = self.panel4d.reindex(major=smaller_major) + + larger = smaller.reindex(major=self.panel4d.major_axis, + method='pad') + + assert_panel_equal(larger.ix[:, :, self.panel4d.major_axis[1], :], + smaller.ix[:, :, smaller_major[0], :]) + + # don't necessarily copy + result = self.panel4d.reindex( + major=self.panel4d.major_axis, copy=False) + assert_panel4d_equal(result, self.panel4d) + self.assertTrue(result is self.panel4d) def test_not_hashable(self): - p4D_empty = Panel4D() - self.assertRaises(TypeError, hash, p4D_empty) - self.assertRaises(TypeError, hash, self.panel4d) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + p4D_empty = Panel4D() + self.assertRaises(TypeError, hash, p4D_empty) + self.assertRaises(TypeError, hash, self.panel4d) def test_reindex_like(self): # reindex_like - smaller = self.panel4d.reindex(labels=self.panel4d.labels[:-1], - items=self.panel4d.items[:-1], - major=self.panel4d.major_axis[:-1], - minor=self.panel4d.minor_axis[:-1]) - smaller_like = self.panel4d.reindex_like(smaller) - assert_panel4d_equal(smaller, smaller_like) - - def test_take(self): - raise nose.SkipTest("skipping for now") - - # # axis == 0 - # result = self.panel.take([2, 0, 1], axis=0) - # expected = self.panel.reindex(items=['ItemC', 'ItemA', 'ItemB']) - # assert_panel_equal(result, expected)# - - # # axis >= 1 - # result = self.panel.take([3, 0, 1, 2], axis=2) - # expected = self.panel.reindex(minor=['D', 'A', 'B', 'C']) - # assert_panel_equal(result, expected) - - # self.assertRaises(Exception, self.panel.take, [3, -1, 1, 2], axis=2) - # self.assertRaises(Exception, self.panel.take, [4, 0, 1, 2], axis=2) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + smaller = self.panel4d.reindex(labels=self.panel4d.labels[:-1], + items=self.panel4d.items[:-1], + major=self.panel4d.major_axis[:-1], + minor=self.panel4d.minor_axis[:-1]) + smaller_like = 
self.panel4d.reindex_like(smaller) + assert_panel4d_equal(smaller, smaller_like) def test_sort_index(self): - import random - - rlabels = list(self.panel4d.labels) - ritems = list(self.panel4d.items) - rmajor = list(self.panel4d.major_axis) - rminor = list(self.panel4d.minor_axis) - random.shuffle(rlabels) - random.shuffle(ritems) - random.shuffle(rmajor) - random.shuffle(rminor) - - random_order = self.panel4d.reindex(labels=rlabels) - sorted_panel4d = random_order.sort_index(axis=0) - assert_panel4d_equal(sorted_panel4d, self.panel4d) - - # descending - # random_order = self.panel.reindex(items=ritems) - # sorted_panel = random_order.sort_index(axis=0, ascending=False) - # assert_panel_equal(sorted_panel, - # self.panel.reindex(items=self.panel.items[::-1])) - - # random_order = self.panel.reindex(major=rmajor) - # sorted_panel = random_order.sort_index(axis=1) - # assert_panel_equal(sorted_panel, self.panel) - - # random_order = self.panel.reindex(minor=rminor) - # sorted_panel = random_order.sort_index(axis=2) - # assert_panel_equal(sorted_panel, self.panel) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + import random + + rlabels = list(self.panel4d.labels) + ritems = list(self.panel4d.items) + rmajor = list(self.panel4d.major_axis) + rminor = list(self.panel4d.minor_axis) + random.shuffle(rlabels) + random.shuffle(ritems) + random.shuffle(rmajor) + random.shuffle(rminor) + + random_order = self.panel4d.reindex(labels=rlabels) + sorted_panel4d = random_order.sort_index(axis=0) + assert_panel4d_equal(sorted_panel4d, self.panel4d) def test_fillna(self): - self.assertFalse(np.isfinite(self.panel4d.values).all()) - filled = self.panel4d.fillna(0) - self.assertTrue(np.isfinite(filled.values).all()) - self.assertRaises(NotImplementedError, - self.panel4d.fillna, method='pad') + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + self.assertFalse(np.isfinite(self.panel4d.values).all()) + filled = self.panel4d.fillna(0) + self.assertTrue(np.isfinite(filled.values).all()) - def test_swapaxes(self): - result = self.panel4d.swapaxes('labels', 'items') - self.assertIs(result.items, self.panel4d.labels) + self.assertRaises(NotImplementedError, + self.panel4d.fillna, method='pad') - result = self.panel4d.swapaxes('labels', 'minor') - self.assertIs(result.labels, self.panel4d.minor_axis) - - result = self.panel4d.swapaxes('items', 'minor') - self.assertIs(result.items, self.panel4d.minor_axis) - - result = self.panel4d.swapaxes('items', 'major') - self.assertIs(result.items, self.panel4d.major_axis) - - result = self.panel4d.swapaxes('major', 'minor') - self.assertIs(result.major_axis, self.panel4d.minor_axis) - - # this should also work - result = self.panel4d.swapaxes(0, 1) - self.assertIs(result.labels, self.panel4d.items) + def test_swapaxes(self): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = self.panel4d.swapaxes('labels', 'items') + self.assertIs(result.items, self.panel4d.labels) - # this works, but return a copy - result = self.panel4d.swapaxes('items', 'items') - assert_panel4d_equal(self.panel4d, result) - self.assertNotEqual(id(self.panel4d), id(result)) + result = self.panel4d.swapaxes('labels', 'minor') + self.assertIs(result.labels, self.panel4d.minor_axis) - def test_to_frame(self): - raise nose.SkipTest("skipping for now") - # # filtered - # filtered = self.panel.to_frame() - # expected = self.panel.to_frame().dropna(how='any') - # assert_frame_equal(filtered, expected) + result = 
self.panel4d.swapaxes('items', 'minor') + self.assertIs(result.items, self.panel4d.minor_axis) - # # unfiltered - # unfiltered = self.panel.to_frame(filter_observations=False) - # assert_panel_equal(unfiltered.to_panel(), self.panel) + result = self.panel4d.swapaxes('items', 'major') + self.assertIs(result.items, self.panel4d.major_axis) - # # names - # self.assertEqual(unfiltered.index.names, ('major', 'minor')) + result = self.panel4d.swapaxes('major', 'minor') + self.assertIs(result.major_axis, self.panel4d.minor_axis) - def test_to_frame_mixed(self): - raise nose.SkipTest("skipping for now") - # panel = self.panel.fillna(0) - # panel['str'] = 'foo' - # panel['bool'] = panel['ItemA'] > 0 + # this should also work + result = self.panel4d.swapaxes(0, 1) + self.assertIs(result.labels, self.panel4d.items) - # lp = panel.to_frame() - # wp = lp.to_panel() - # self.assertEqual(wp['bool'].values.dtype, np.bool_) - # assert_frame_equal(wp['bool'], panel['bool']) + # this works, but return a copy + result = self.panel4d.swapaxes('items', 'items') + assert_panel4d_equal(self.panel4d, result) + self.assertNotEqual(id(self.panel4d), id(result)) def test_update(self): - p4d = Panel4D([[[[1.5, np.nan, 3.], - [1.5, np.nan, 3.], - [1.5, np.nan, 3.], - [1.5, np.nan, 3.]], - [[1.5, np.nan, 3.], - [1.5, np.nan, 3.], - [1.5, np.nan, 3.], - [1.5, np.nan, 3.]]]]) - - other = Panel4D([[[[3.6, 2., np.nan]], - [[np.nan, np.nan, 7]]]]) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + p4d = Panel4D([[[[1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.]], + [[1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.]]]]) - p4d.update(other) + other = Panel4D([[[[3.6, 2., np.nan]], + [[np.nan, np.nan, 7]]]]) - expected = Panel4D([[[[3.6, 2, 3.], - [1.5, np.nan, 3.], - [1.5, np.nan, 3.], - [1.5, np.nan, 3.]], - [[1.5, np.nan, 7], - [1.5, np.nan, 3.], - [1.5, np.nan, 3.], - [1.5, np.nan, 3.]]]]) + p4d.update(other) - assert_panel4d_equal(p4d, expected) + expected = Panel4D([[[[3.6, 2, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.]], + [[1.5, np.nan, 7], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.], + [1.5, np.nan, 3.]]]]) - def test_filter(self): - raise nose.SkipTest("skipping for now") - - def test_apply(self): - raise nose.SkipTest("skipping for now") + assert_panel4d_equal(p4d, expected) def test_dtypes(self): @@ -1013,98 +919,36 @@ def test_dtypes(self): expected = Series(np.dtype('float64'), index=self.panel4d.labels) assert_series_equal(result, expected) - def test_compound(self): - raise nose.SkipTest("skipping for now") - # compounded = self.panel.compound() - - # assert_series_equal(compounded['ItemA'], - # (1 + self.panel['ItemA']).product(0) - 1) - - def test_shift(self): - raise nose.SkipTest("skipping for now") - # # major - # idx = self.panel.major_axis[0] - # idx_lag = self.panel.major_axis[1] - - # shifted = self.panel.shift(1) - - # assert_frame_equal(self.panel.major_xs(idx), - # shifted.major_xs(idx_lag)) - - # # minor - # idx = self.panel.minor_axis[0] - # idx_lag = self.panel.minor_axis[1] - - # shifted = self.panel.shift(1, axis='minor') - - # assert_frame_equal(self.panel.minor_xs(idx), - # shifted.minor_xs(idx_lag)) - - # self.assertRaises(Exception, self.panel.shift, 1, axis='items') - - def test_multiindex_get(self): - raise nose.SkipTest("skipping for now") - # ind = MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b',2)], - # names=['first', 'second']) - # wp = 
Panel(np.random.random((4,5,5)), - # items=ind, - # major_axis=np.arange(5), - # minor_axis=np.arange(5)) - # f1 = wp['a'] - # f2 = wp.ix['a'] - # assert_panel_equal(f1, f2) - - # self.assertTrue((f1.items == [1, 2]).all()) - # self.assertTrue((f2.items == [1, 2]).all()) - - # ind = MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)], - # names=['first', 'second']) - - def test_multiindex_blocks(self): - raise nose.SkipTest("skipping for now") - # ind = MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)], - # names=['first', 'second']) - # wp = Panel(self.panel._data) - # wp.items = ind - # f1 = wp['a'] - # self.assertTrue((f1.items == [1, 2]).all()) - - # f1 = wp[('b',1)] - # self.assertTrue((f1.columns == ['A', 'B', 'C', 'D']).all()) - def test_repr_empty(self): - empty = Panel4D() - repr(empty) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + empty = Panel4D() + repr(empty) def test_rename(self): - mapper = { - 'l1': 'foo', - 'l2': 'bar', - 'l3': 'baz' - } + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + + mapper = {'l1': 'foo', + 'l2': 'bar', + 'l3': 'baz'} - renamed = self.panel4d.rename_axis(mapper, axis=0) - exp = Index(['foo', 'bar', 'baz']) - self.assert_index_equal(renamed.labels, exp) + renamed = self.panel4d.rename_axis(mapper, axis=0) + exp = Index(['foo', 'bar', 'baz']) + self.assert_index_equal(renamed.labels, exp) - renamed = self.panel4d.rename_axis(str.lower, axis=3) - exp = Index(['a', 'b', 'c', 'd']) - self.assert_index_equal(renamed.minor_axis, exp) + renamed = self.panel4d.rename_axis(str.lower, axis=3) + exp = Index(['a', 'b', 'c', 'd']) + self.assert_index_equal(renamed.minor_axis, exp) - # don't copy - renamed_nocopy = self.panel4d.rename_axis(mapper, axis=0, copy=False) - renamed_nocopy['foo'] = 3. - self.assertTrue((self.panel4d['l1'].values == 3).all()) + # don't copy + renamed_nocopy = self.panel4d.rename_axis(mapper, + axis=0, + copy=False) + renamed_nocopy['foo'] = 3. 
+ self.assertTrue((self.panel4d['l1'].values == 3).all()) def test_get_attr(self): assert_panel_equal(self.panel4d['l1'], self.panel4d.l1) - def test_from_frame_level1_unsorted(self): - raise nose.SkipTest("skipping for now") - - def test_to_excel(self): - raise nose.SkipTest("skipping for now") - if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tests/test_panelnd.py b/pandas/tests/test_panelnd.py index ac497bc580585..2a69a65e8d55e 100644 --- a/pandas/tests/test_panelnd.py +++ b/pandas/tests/test_panelnd.py @@ -15,31 +15,35 @@ def setUp(self): def test_4d_construction(self): - # create a 4D - Panel4D = panelnd.create_nd_panel_factory( - klass_name='Panel4D', - orders=['labels', 'items', 'major_axis', 'minor_axis'], - slices={'items': 'items', 'major_axis': 'major_axis', - 'minor_axis': 'minor_axis'}, - slicer=Panel, - aliases={'major': 'major_axis', 'minor': 'minor_axis'}, - stat_axis=2) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + + # create a 4D + Panel4D = panelnd.create_nd_panel_factory( + klass_name='Panel4D', + orders=['labels', 'items', 'major_axis', 'minor_axis'], + slices={'items': 'items', 'major_axis': 'major_axis', + 'minor_axis': 'minor_axis'}, + slicer=Panel, + aliases={'major': 'major_axis', 'minor': 'minor_axis'}, + stat_axis=2) - p4d = Panel4D(dict(L1=tm.makePanel(), L2=tm.makePanel())) # noqa + p4d = Panel4D(dict(L1=tm.makePanel(), L2=tm.makePanel())) # noqa def test_4d_construction_alt(self): - # create a 4D - Panel4D = panelnd.create_nd_panel_factory( - klass_name='Panel4D', - orders=['labels', 'items', 'major_axis', 'minor_axis'], - slices={'items': 'items', 'major_axis': 'major_axis', - 'minor_axis': 'minor_axis'}, - slicer='Panel', - aliases={'major': 'major_axis', 'minor': 'minor_axis'}, - stat_axis=2) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - p4d = Panel4D(dict(L1=tm.makePanel(), L2=tm.makePanel())) # noqa + # create a 4D + Panel4D = panelnd.create_nd_panel_factory( + klass_name='Panel4D', + orders=['labels', 'items', 'major_axis', 'minor_axis'], + slices={'items': 'items', 'major_axis': 'major_axis', + 'minor_axis': 'minor_axis'}, + slicer='Panel', + aliases={'major': 'major_axis', 'minor': 'minor_axis'}, + stat_axis=2) + + p4d = Panel4D(dict(L1=tm.makePanel(), L2=tm.makePanel())) # noqa def test_4d_construction_error(self): @@ -59,40 +63,44 @@ def test_4d_construction_error(self): def test_5d_construction(self): - # create a 4D - Panel4D = panelnd.create_nd_panel_factory( - klass_name='Panel4D', - orders=['labels1', 'items', 'major_axis', 'minor_axis'], - slices={'items': 'items', 'major_axis': 'major_axis', - 'minor_axis': 'minor_axis'}, - slicer=Panel, - aliases={'major': 'major_axis', 'minor': 'minor_axis'}, - stat_axis=2) - - p4d = Panel4D(dict(L1=tm.makePanel(), L2=tm.makePanel())) - - # create a 5D - Panel5D = panelnd.create_nd_panel_factory( - klass_name='Panel5D', - orders=['cool1', 'labels1', 'items', 'major_axis', - 'minor_axis'], - slices={'labels1': 'labels1', 'items': 'items', - 'major_axis': 'major_axis', - 'minor_axis': 'minor_axis'}, - slicer=Panel4D, - aliases={'major': 'major_axis', 'minor': 'minor_axis'}, - stat_axis=2) - - p5d = Panel5D(dict(C1=p4d)) - - # slice back to 4d - results = p5d.ix['C1', :, :, 0:3, :] - expected = p4d.ix[:, :, 0:3, :] - assert_panel_equal(results['L1'], expected['L1']) - - # test a transpose - # results = p5d.transpose(1,2,3,4,0) - # expected = + with 
tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + + # create a 4D + Panel4D = panelnd.create_nd_panel_factory( + klass_name='Panel4D', + orders=['labels1', 'items', 'major_axis', 'minor_axis'], + slices={'items': 'items', 'major_axis': 'major_axis', + 'minor_axis': 'minor_axis'}, + slicer=Panel, + aliases={'major': 'major_axis', 'minor': 'minor_axis'}, + stat_axis=2) + + # deprecation GH13564 + p4d = Panel4D(dict(L1=tm.makePanel(), L2=tm.makePanel())) + + # create a 5D + Panel5D = panelnd.create_nd_panel_factory( + klass_name='Panel5D', + orders=['cool1', 'labels1', 'items', 'major_axis', + 'minor_axis'], + slices={'labels1': 'labels1', 'items': 'items', + 'major_axis': 'major_axis', + 'minor_axis': 'minor_axis'}, + slicer=Panel4D, + aliases={'major': 'major_axis', 'minor': 'minor_axis'}, + stat_axis=2) + + # deprecation GH13564 + p5d = Panel5D(dict(C1=p4d)) + + # slice back to 4d + results = p5d.ix['C1', :, :, 0:3, :] + expected = p4d.ix[:, :, 0:3, :] + assert_panel_equal(results['L1'], expected['L1']) + + # test a transpose + # results = p5d.transpose(1,2,3,4,0) + # expected = if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py index 7136d7effc1fc..80d1f5f76e5a9 100644 --- a/pandas/tests/test_reshape.py +++ b/pandas/tests/test_reshape.py @@ -174,15 +174,15 @@ def test_basic(self): s_series = Series(s_list) s_series_index = Series(s_list, list('ABC')) - expected = DataFrame({'a': {0: 1.0, - 1: 0.0, - 2: 0.0}, - 'b': {0: 0.0, - 1: 1.0, - 2: 0.0}, - 'c': {0: 0.0, - 1: 0.0, - 2: 1.0}}) + expected = DataFrame({'a': {0: 1, + 1: 0, + 2: 0}, + 'b': {0: 0, + 1: 1, + 2: 0}, + 'c': {0: 0, + 1: 0, + 2: 1}}, dtype=np.uint8) assert_frame_equal(get_dummies(s_list, sparse=self.sparse), expected) assert_frame_equal(get_dummies(s_series, sparse=self.sparse), expected) @@ -200,7 +200,7 @@ def test_basic_types(self): if not self.sparse: exp_df_type = DataFrame - exp_blk_type = pd.core.internals.FloatBlock + exp_blk_type = pd.core.internals.IntBlock else: exp_df_type = SparseDataFrame exp_blk_type = pd.core.internals.SparseBlock @@ -239,22 +239,24 @@ def test_just_na(self): def test_include_na(self): s = ['a', 'b', np.nan] res = get_dummies(s, sparse=self.sparse) - exp = DataFrame({'a': {0: 1.0, 1: 0.0, 2: 0.0}, - 'b': {0: 0.0, 1: 1.0, 2: 0.0}}) + exp = DataFrame({'a': {0: 1, 1: 0, 2: 0}, + 'b': {0: 0, 1: 1, 2: 0}}, dtype=np.uint8) assert_frame_equal(res, exp) # Sparse dataframes do not allow nan labelled columns, see #GH8822 res_na = get_dummies(s, dummy_na=True, sparse=self.sparse) - exp_na = DataFrame({nan: {0: 0.0, 1: 0.0, 2: 1.0}, - 'a': {0: 1.0, 1: 0.0, 2: 0.0}, - 'b': {0: 0.0, 1: 1.0, 2: 0.0}}) + exp_na = DataFrame({nan: {0: 0, 1: 0, 2: 1}, + 'a': {0: 1, 1: 0, 2: 0}, + 'b': {0: 0, 1: 1, 2: 0}}, + dtype=np.uint8) exp_na = exp_na.reindex_axis(['a', 'b', nan], 1) # hack (NaN handling in assert_index_equal) exp_na.columns = res_na.columns assert_frame_equal(res_na, exp_na) res_just_na = get_dummies([nan], dummy_na=True, sparse=self.sparse) - exp_just_na = DataFrame(Series(1.0, index=[0]), columns=[nan]) + exp_just_na = DataFrame(Series(1, index=[0]), columns=[nan], + dtype=np.uint8) tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values) def test_unicode(self @@ -264,31 +266,34 @@ def test_unicode(self eacute = unicodedata.lookup('LATIN SMALL LETTER E WITH ACUTE') s = [e, eacute, eacute] res = get_dummies(s, prefix='letter', sparse=self.sparse) - exp = 
DataFrame({'letter_e': {0: 1.0, - 1: 0.0, - 2: 0.0}, - u('letter_%s') % eacute: {0: 0.0, - 1: 1.0, - 2: 1.0}}) + exp = DataFrame({'letter_e': {0: 1, + 1: 0, + 2: 0}, + u('letter_%s') % eacute: {0: 0, + 1: 1, + 2: 1}}, + dtype=np.uint8) assert_frame_equal(res, exp) def test_dataframe_dummies_all_obj(self): df = self.df[['A', 'B']] result = get_dummies(df, sparse=self.sparse) - expected = DataFrame({'A_a': [1., 0, 1], - 'A_b': [0., 1, 0], - 'B_b': [1., 1, 0], - 'B_c': [0., 0, 1]}) + expected = DataFrame({'A_a': [1, 0, 1], + 'A_b': [0, 1, 0], + 'B_b': [1, 1, 0], + 'B_c': [0, 0, 1]}, dtype=np.uint8) assert_frame_equal(result, expected) def test_dataframe_dummies_mix_default(self): df = self.df result = get_dummies(df, sparse=self.sparse) expected = DataFrame({'C': [1, 2, 3], - 'A_a': [1., 0, 1], - 'A_b': [0., 1, 0], - 'B_b': [1., 1, 0], - 'B_c': [0., 0, 1]}) + 'A_a': [1, 0, 1], + 'A_b': [0, 1, 0], + 'B_b': [1, 1, 0], + 'B_c': [0, 0, 1]}) + cols = ['A_a', 'A_b', 'B_b', 'B_c'] + expected[cols] = expected[cols].astype(np.uint8) expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']] assert_frame_equal(result, expected) @@ -299,10 +304,12 @@ def test_dataframe_dummies_prefix_list(self): 'C': [1, 2, 3]}) result = get_dummies(df, prefix=prefixes, sparse=self.sparse) expected = DataFrame({'C': [1, 2, 3], - 'from_A_a': [1., 0, 1], - 'from_A_b': [0., 1, 0], - 'from_B_b': [1., 1, 0], - 'from_B_c': [0., 0, 1]}) + 'from_A_a': [1, 0, 1], + 'from_A_b': [0, 1, 0], + 'from_B_b': [1, 1, 0], + 'from_B_c': [0, 0, 1]}) + cols = expected.columns[1:] + expected[cols] = expected[cols].astype(np.uint8) expected = expected[['C', 'from_A_a', 'from_A_b', 'from_B_b', 'from_B_c']] assert_frame_equal(result, expected) @@ -311,31 +318,37 @@ def test_dataframe_dummies_prefix_str(self): # not that you should do this... 
df = self.df result = get_dummies(df, prefix='bad', sparse=self.sparse) - expected = DataFrame([[1, 1., 0., 1., 0.], - [2, 0., 1., 1., 0.], - [3, 1., 0., 0., 1.]], - columns=['C', 'bad_a', 'bad_b', 'bad_b', 'bad_c']) + expected = DataFrame([[1, 1, 0, 1, 0], + [2, 0, 1, 1, 0], + [3, 1, 0, 0, 1]], + columns=['C', 'bad_a', 'bad_b', 'bad_b', 'bad_c'], + dtype=np.uint8) + expected = expected.astype({"C": np.int64}) assert_frame_equal(result, expected) def test_dataframe_dummies_subset(self): df = self.df result = get_dummies(df, prefix=['from_A'], columns=['A'], sparse=self.sparse) - expected = DataFrame({'from_A_a': [1., 0, 1], - 'from_A_b': [0., 1, 0], + expected = DataFrame({'from_A_a': [1, 0, 1], + 'from_A_b': [0, 1, 0], 'B': ['b', 'b', 'c'], 'C': [1, 2, 3]}) + cols = ['from_A_a', 'from_A_b'] + expected[cols] = expected[cols].astype(np.uint8) assert_frame_equal(result, expected) def test_dataframe_dummies_prefix_sep(self): df = self.df result = get_dummies(df, prefix_sep='..', sparse=self.sparse) expected = DataFrame({'C': [1, 2, 3], - 'A..a': [1., 0, 1], - 'A..b': [0., 1, 0], - 'B..b': [1., 1, 0], - 'B..c': [0., 0, 1]}) + 'A..a': [1, 0, 1], + 'A..b': [0, 1, 0], + 'B..b': [1, 1, 0], + 'B..c': [0, 0, 1]}) expected = expected[['C', 'A..a', 'A..b', 'B..b', 'B..c']] + cols = expected.columns[1:] + expected[cols] = expected[cols].astype(np.uint8) assert_frame_equal(result, expected) result = get_dummies(df, prefix_sep=['..', '__'], sparse=self.sparse) @@ -360,11 +373,13 @@ def test_dataframe_dummies_prefix_dict(self): 'B': ['b', 'b', 'c'], 'C': [1, 2, 3]}) result = get_dummies(df, prefix=prefixes, sparse=self.sparse) - expected = DataFrame({'from_A_a': [1., 0, 1], - 'from_A_b': [0., 1, 0], - 'from_B_b': [1., 1, 0], - 'from_B_c': [0., 0, 1], + expected = DataFrame({'from_A_a': [1, 0, 1], + 'from_A_b': [0, 1, 0], + 'from_B_b': [1, 1, 0], + 'from_B_c': [0, 0, 1], 'C': [1, 2, 3]}) + cols = ['from_A_a', 'from_A_b', 'from_B_b', 'from_B_c'] + expected[cols] = expected[cols].astype(np.uint8) assert_frame_equal(result, expected) def test_dataframe_dummies_with_na(self): @@ -372,12 +387,14 @@ def test_dataframe_dummies_with_na(self): df.loc[3, :] = [np.nan, np.nan, np.nan] result = get_dummies(df, dummy_na=True, sparse=self.sparse) expected = DataFrame({'C': [1, 2, 3, np.nan], - 'A_a': [1., 0, 1, 0], - 'A_b': [0., 1, 0, 0], - 'A_nan': [0., 0, 0, 1], - 'B_b': [1., 1, 0, 0], - 'B_c': [0., 0, 1, 0], - 'B_nan': [0., 0, 0, 1]}) + 'A_a': [1, 0, 1, 0], + 'A_b': [0, 1, 0, 0], + 'A_nan': [0, 0, 0, 1], + 'B_b': [1, 1, 0, 0], + 'B_c': [0, 0, 1, 0], + 'B_nan': [0, 0, 0, 1]}) + cols = ['A_a', 'A_b', 'A_nan', 'B_b', 'B_c', 'B_nan'] + expected[cols] = expected[cols].astype(np.uint8) expected = expected[['C', 'A_a', 'A_b', 'A_nan', 'B_b', 'B_c', 'B_nan']] assert_frame_equal(result, expected) @@ -391,12 +408,14 @@ def test_dataframe_dummies_with_categorical(self): df['cat'] = pd.Categorical(['x', 'y', 'y']) result = get_dummies(df, sparse=self.sparse) expected = DataFrame({'C': [1, 2, 3], - 'A_a': [1., 0, 1], - 'A_b': [0., 1, 0], - 'B_b': [1., 1, 0], - 'B_c': [0., 0, 1], - 'cat_x': [1., 0, 0], - 'cat_y': [0., 1, 1]}) + 'A_a': [1, 0, 1], + 'A_b': [0, 1, 0], + 'B_b': [1, 1, 0], + 'B_c': [0, 0, 1], + 'cat_x': [1, 0, 0], + 'cat_y': [0, 1, 1]}) + cols = ['A_a', 'A_b', 'B_b', 'B_c', 'cat_x', 'cat_y'] + expected[cols] = expected[cols].astype(np.uint8) expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c', 'cat_x', 'cat_y']] assert_frame_equal(result, expected) @@ -408,12 +427,12 @@ def test_basic_drop_first(self): s_series = 
Series(s_list) s_series_index = Series(s_list, list('ABC')) - expected = DataFrame({'b': {0: 0.0, - 1: 1.0, - 2: 0.0}, - 'c': {0: 0.0, - 1: 0.0, - 2: 1.0}}) + expected = DataFrame({'b': {0: 0, + 1: 1, + 2: 0}, + 'c': {0: 0, + 1: 0, + 2: 1}}, dtype=np.uint8) result = get_dummies(s_list, sparse=self.sparse, drop_first=True) assert_frame_equal(result, expected) @@ -449,19 +468,19 @@ def test_basic_drop_first_NA(self): # Test NA hadling together with drop_first s_NA = ['a', 'b', np.nan] res = get_dummies(s_NA, sparse=self.sparse, drop_first=True) - exp = DataFrame({'b': {0: 0.0, - 1: 1.0, - 2: 0.0}}) + exp = DataFrame({'b': {0: 0, + 1: 1, + 2: 0}}, dtype=np.uint8) assert_frame_equal(res, exp) res_na = get_dummies(s_NA, dummy_na=True, sparse=self.sparse, drop_first=True) - exp_na = DataFrame({'b': {0: 0.0, - 1: 1.0, - 2: 0.0}, - nan: {0: 0.0, - 1: 0.0, - 2: 1.0}}).reindex_axis( + exp_na = DataFrame({'b': {0: 0, + 1: 1, + 2: 0}, + nan: {0: 0, + 1: 0, + 2: 1}}, dtype=np.uint8).reindex_axis( ['b', nan], 1) assert_frame_equal(res_na, exp_na) @@ -473,8 +492,8 @@ def test_basic_drop_first_NA(self): def test_dataframe_dummies_drop_first(self): df = self.df[['A', 'B']] result = get_dummies(df, sparse=self.sparse, drop_first=True) - expected = DataFrame({'A_b': [0., 1, 0], - 'B_c': [0., 0, 1]}) + expected = DataFrame({'A_b': [0, 1, 0], + 'B_c': [0, 0, 1]}, dtype=np.uint8) assert_frame_equal(result, expected) def test_dataframe_dummies_drop_first_with_categorical(self): @@ -482,9 +501,11 @@ def test_dataframe_dummies_drop_first_with_categorical(self): df['cat'] = pd.Categorical(['x', 'y', 'y']) result = get_dummies(df, sparse=self.sparse, drop_first=True) expected = DataFrame({'C': [1, 2, 3], - 'A_b': [0., 1, 0], - 'B_c': [0., 0, 1], - 'cat_y': [0., 1, 1]}) + 'A_b': [0, 1, 0], + 'B_c': [0, 0, 1], + 'cat_y': [0, 1, 1]}) + cols = ['A_b', 'B_c', 'cat_y'] + expected[cols] = expected[cols].astype(np.uint8) expected = expected[['C', 'A_b', 'B_c', 'cat_y']] assert_frame_equal(result, expected) @@ -494,10 +515,13 @@ def test_dataframe_dummies_drop_first_with_na(self): result = get_dummies(df, dummy_na=True, sparse=self.sparse, drop_first=True) expected = DataFrame({'C': [1, 2, 3, np.nan], - 'A_b': [0., 1, 0, 0], - 'A_nan': [0., 0, 0, 1], - 'B_c': [0., 0, 1, 0], - 'B_nan': [0., 0, 0, 1]}) + 'A_b': [0, 1, 0, 0], + 'A_nan': [0, 0, 0, 1], + 'B_c': [0, 0, 1, 0], + 'B_nan': [0, 0, 0, 1]}) + cols = ['A_b', 'A_nan', 'B_c', 'B_nan'] + expected[cols] = expected[cols].astype(np.uint8) + expected = expected[['C', 'A_b', 'A_nan', 'B_c', 'B_nan']] assert_frame_equal(result, expected) @@ -506,11 +530,79 @@ def test_dataframe_dummies_drop_first_with_na(self): expected = expected[['C', 'A_b', 'B_c']] assert_frame_equal(result, expected) + def test_int_int(self): + data = Series([1, 2, 1]) + result = pd.get_dummies(data) + expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=[1, 2], + dtype=np.uint8) + tm.assert_frame_equal(result, expected) + + data = Series(pd.Categorical(['a', 'b', 'a'])) + result = pd.get_dummies(data) + expected = DataFrame([[1, 0], [0, 1], [1, 0]], + columns=pd.Categorical(['a', 'b']), + dtype=np.uint8) + tm.assert_frame_equal(result, expected) + + def test_int_df(self): + data = DataFrame( + {'A': [1, 2, 1], + 'B': pd.Categorical(['a', 'b', 'a']), + 'C': [1, 2, 1], + 'D': [1., 2., 1.] 
+ } + ) + columns = ['C', 'D', 'A_1', 'A_2', 'B_a', 'B_b'] + expected = DataFrame([ + [1, 1., 1, 0, 1, 0], + [2, 2., 0, 1, 0, 1], + [1, 1., 1, 0, 1, 0] + ], columns=columns) + expected[columns[2:]] = expected[columns[2:]].astype(np.uint8) + result = pd.get_dummies(data, columns=['A', 'B']) + tm.assert_frame_equal(result, expected) + + def test_dataframe_dummies_preserve_categorical_dtype(self): + # GH13854 + for ordered in [False, True]: + cat = pd.Categorical(list("xy"), categories=list("xyz"), + ordered=ordered) + result = get_dummies(cat) + + data = np.array([[1, 0, 0], [0, 1, 0]], dtype=np.uint8) + cols = pd.CategoricalIndex(cat.categories, + categories=cat.categories, + ordered=ordered) + expected = DataFrame(data, columns=cols) + + tm.assert_frame_equal(result, expected) + class TestGetDummiesSparse(TestGetDummies): sparse = True +class TestMakeAxisDummies(tm.TestCase): + + def test_preserve_categorical_dtype(self): + # GH13854 + for ordered in [False, True]: + cidx = pd.CategoricalIndex(list("xyz"), ordered=ordered) + midx = pd.MultiIndex(levels=[['a'], cidx], + labels=[[0, 0], [0, 1]]) + df = DataFrame([[10, 11]], index=midx) + + expected = DataFrame([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]], + index=midx, columns=cidx) + + from pandas.core.reshape import make_axis_dummies + result = make_axis_dummies(df) + tm.assert_frame_equal(result, expected) + + result = make_axis_dummies(df, transform=lambda x: x) + tm.assert_frame_equal(result, expected) + + class TestLreshape(tm.TestCase): def test_pairs(self): diff --git a/pandas/tests/test_rplot.py b/pandas/tests/test_rplot.py deleted file mode 100644 index 6be6c53cbb201..0000000000000 --- a/pandas/tests/test_rplot.py +++ /dev/null @@ -1,316 +0,0 @@ -# -*- coding: utf-8 -*- -from pandas.compat import range -import pandas.util.testing as tm -from pandas import read_csv -import os - -with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - import pandas.tools.rplot as rplot - - -def curpath(): - pth, _ = os.path.split(os.path.abspath(__file__)) - return pth - - -def between(a, b, x): - """Check if x is in the somewhere between a and b. - - Parameters: - ----------- - a: float, interval start - b: float, interval end - x: float, value to test for - - Returns: - -------- - True if x is between a and b, False otherwise - """ - if a < b: - return x >= a and x <= b - else: - return x <= a and x >= b - - -@tm.mplskip -class TestUtilityFunctions(tm.TestCase): - """ - Tests for RPlot utility functions. 
- """ - - def setUp(self): - path = os.path.join(curpath(), 'data/iris.csv') - self.data = read_csv(path, sep=',') - - def test_make_aes1(self): - aes = rplot.make_aes() - self.assertTrue(aes['x'] is None) - self.assertTrue(aes['y'] is None) - self.assertTrue(aes['size'] is None) - self.assertTrue(aes['colour'] is None) - self.assertTrue(aes['shape'] is None) - self.assertTrue(aes['alpha'] is None) - self.assertTrue(isinstance(aes, dict)) - - def test_make_aes2(self): - self.assertRaises(ValueError, rplot.make_aes, - size=rplot.ScaleShape('test')) - self.assertRaises(ValueError, rplot.make_aes, - colour=rplot.ScaleShape('test')) - self.assertRaises(ValueError, rplot.make_aes, - shape=rplot.ScaleSize('test')) - self.assertRaises(ValueError, rplot.make_aes, - alpha=rplot.ScaleShape('test')) - - def test_dictionary_union(self): - dict1 = {1: 1, 2: 2, 3: 3} - dict2 = {1: 1, 2: 2, 4: 4} - union = rplot.dictionary_union(dict1, dict2) - self.assertEqual(len(union), 4) - keys = list(union.keys()) - self.assertTrue(1 in keys) - self.assertTrue(2 in keys) - self.assertTrue(3 in keys) - self.assertTrue(4 in keys) - self.assertEqual(rplot.dictionary_union(dict1, {}), dict1) - self.assertEqual(rplot.dictionary_union({}, dict1), dict1) - self.assertEqual(rplot.dictionary_union({}, {}), {}) - - def test_merge_aes(self): - layer1 = rplot.Layer(size=rplot.ScaleSize('test')) - layer2 = rplot.Layer(shape=rplot.ScaleShape('test')) - rplot.merge_aes(layer1, layer2) - self.assertTrue(isinstance(layer2.aes['size'], rplot.ScaleSize)) - self.assertTrue(isinstance(layer2.aes['shape'], rplot.ScaleShape)) - self.assertEqual(layer2.aes['size'], layer1.aes['size']) - for key in layer2.aes.keys(): - if key != 'size' and key != 'shape': - self.assertTrue(layer2.aes[key] is None) - - def test_sequence_layers(self): - layer1 = rplot.Layer(self.data) - layer2 = rplot.GeomPoint(x='SepalLength', y='SepalWidth', - size=rplot.ScaleSize('PetalLength')) - layer3 = rplot.GeomPolyFit(2) - result = rplot.sequence_layers([layer1, layer2, layer3]) - self.assertEqual(len(result), 3) - last = result[-1] - self.assertEqual(last.aes['x'], 'SepalLength') - self.assertEqual(last.aes['y'], 'SepalWidth') - self.assertTrue(isinstance(last.aes['size'], rplot.ScaleSize)) - self.assertTrue(self.data is last.data) - self.assertTrue(rplot.sequence_layers([layer1])[0] is layer1) - - -@tm.mplskip -class TestTrellis(tm.TestCase): - - def setUp(self): - path = os.path.join(curpath(), 'data/tips.csv') - self.data = read_csv(path, sep=',') - layer1 = rplot.Layer(self.data) - layer2 = rplot.GeomPoint(x='total_bill', y='tip') - layer3 = rplot.GeomPolyFit(2) - self.layers = rplot.sequence_layers([layer1, layer2, layer3]) - self.trellis1 = rplot.TrellisGrid(['sex', 'smoker']) - self.trellis2 = rplot.TrellisGrid(['sex', '.']) - self.trellis3 = rplot.TrellisGrid(['.', 'smoker']) - self.trellised1 = self.trellis1.trellis(self.layers) - self.trellised2 = self.trellis2.trellis(self.layers) - self.trellised3 = self.trellis3.trellis(self.layers) - - def test_grid_sizes(self): - self.assertEqual(len(self.trellised1), 3) - self.assertEqual(len(self.trellised2), 3) - self.assertEqual(len(self.trellised3), 3) - self.assertEqual(len(self.trellised1[0]), 2) - self.assertEqual(len(self.trellised1[0][0]), 2) - self.assertEqual(len(self.trellised2[0]), 2) - self.assertEqual(len(self.trellised2[0][0]), 1) - self.assertEqual(len(self.trellised3[0]), 1) - self.assertEqual(len(self.trellised3[0][0]), 2) - self.assertEqual(len(self.trellised1[1]), 2) - 
self.assertEqual(len(self.trellised1[1][0]), 2) - self.assertEqual(len(self.trellised2[1]), 2) - self.assertEqual(len(self.trellised2[1][0]), 1) - self.assertEqual(len(self.trellised3[1]), 1) - self.assertEqual(len(self.trellised3[1][0]), 2) - self.assertEqual(len(self.trellised1[2]), 2) - self.assertEqual(len(self.trellised1[2][0]), 2) - self.assertEqual(len(self.trellised2[2]), 2) - self.assertEqual(len(self.trellised2[2][0]), 1) - self.assertEqual(len(self.trellised3[2]), 1) - self.assertEqual(len(self.trellised3[2][0]), 2) - - def test_trellis_cols_rows(self): - self.assertEqual(self.trellis1.cols, 2) - self.assertEqual(self.trellis1.rows, 2) - self.assertEqual(self.trellis2.cols, 1) - self.assertEqual(self.trellis2.rows, 2) - self.assertEqual(self.trellis3.cols, 2) - self.assertEqual(self.trellis3.rows, 1) - - -@tm.mplskip -class TestScaleGradient(tm.TestCase): - - def setUp(self): - path = os.path.join(curpath(), 'data/iris.csv') - self.data = read_csv(path, sep=',') - self.gradient = rplot.ScaleGradient("SepalLength", colour1=(0.2, 0.3, - 0.4), - colour2=(0.8, 0.7, 0.6)) - - def test_gradient(self): - for index in range(len(self.data)): - # row = self.data.iloc[index] - r, g, b = self.gradient(self.data, index) - r1, g1, b1 = self.gradient.colour1 - r2, g2, b2 = self.gradient.colour2 - self.assertTrue(between(r1, r2, r)) - self.assertTrue(between(g1, g2, g)) - self.assertTrue(between(b1, b2, b)) - - -@tm.mplskip -class TestScaleGradient2(tm.TestCase): - - def setUp(self): - path = os.path.join(curpath(), 'data/iris.csv') - self.data = read_csv(path, sep=',') - self.gradient = rplot.ScaleGradient2("SepalLength", colour1=( - 0.2, 0.3, 0.4), colour2=(0.8, 0.7, 0.6), colour3=(0.5, 0.5, 0.5)) - - def test_gradient2(self): - for index in range(len(self.data)): - row = self.data.iloc[index] - r, g, b = self.gradient(self.data, index) - r1, g1, b1 = self.gradient.colour1 - r2, g2, b2 = self.gradient.colour2 - r3, g3, b3 = self.gradient.colour3 - value = row[self.gradient.column] - a_ = min(self.data[self.gradient.column]) - b_ = max(self.data[self.gradient.column]) - scaled = (value - a_) / (b_ - a_) - if scaled < 0.5: - self.assertTrue(between(r1, r2, r)) - self.assertTrue(between(g1, g2, g)) - self.assertTrue(between(b1, b2, b)) - else: - self.assertTrue(between(r2, r3, r)) - self.assertTrue(between(g2, g3, g)) - self.assertTrue(between(b2, b3, b)) - - -@tm.mplskip -class TestScaleRandomColour(tm.TestCase): - - def setUp(self): - path = os.path.join(curpath(), 'data/iris.csv') - self.data = read_csv(path, sep=',') - self.colour = rplot.ScaleRandomColour('SepalLength') - - def test_random_colour(self): - for index in range(len(self.data)): - colour = self.colour(self.data, index) - self.assertEqual(len(colour), 3) - r, g, b = colour - self.assertTrue(r >= 0.0) - self.assertTrue(g >= 0.0) - self.assertTrue(b >= 0.0) - self.assertTrue(r <= 1.0) - self.assertTrue(g <= 1.0) - self.assertTrue(b <= 1.0) - - -@tm.mplskip -class TestScaleConstant(tm.TestCase): - - def test_scale_constant(self): - scale = rplot.ScaleConstant(1.0) - self.assertEqual(scale(None, None), 1.0) - scale = rplot.ScaleConstant("test") - self.assertEqual(scale(None, None), "test") - - -class TestScaleSize(tm.TestCase): - - def setUp(self): - path = os.path.join(curpath(), 'data/iris.csv') - self.data = read_csv(path, sep=',') - self.scale1 = rplot.ScaleShape('Name') - self.scale2 = rplot.ScaleShape('PetalLength') - - def test_scale_size(self): - for index in range(len(self.data)): - marker = self.scale1(self.data, index) - 
self.assertTrue( - marker in ['o', '+', 's', '*', '^', '<', '>', 'v', '|', 'x']) - - def test_scale_overflow(self): - def f(): - for index in range(len(self.data)): - self.scale2(self.data, index) - - self.assertRaises(ValueError, f) - - -@tm.mplskip -class TestRPlot(tm.TestCase): - - def test_rplot1(self): - import matplotlib.pyplot as plt - path = os.path.join(curpath(), 'data/tips.csv') - plt.figure() - self.data = read_csv(path, sep=',') - self.plot = rplot.RPlot(self.data, x='tip', y='total_bill') - self.plot.add(rplot.TrellisGrid(['sex', 'smoker'])) - self.plot.add(rplot.GeomPoint(colour=rplot.ScaleRandomColour( - 'day'), shape=rplot.ScaleShape('size'))) - self.fig = plt.gcf() - self.plot.render(self.fig) - - def test_rplot2(self): - import matplotlib.pyplot as plt - path = os.path.join(curpath(), 'data/tips.csv') - plt.figure() - self.data = read_csv(path, sep=',') - self.plot = rplot.RPlot(self.data, x='tip', y='total_bill') - self.plot.add(rplot.TrellisGrid(['.', 'smoker'])) - self.plot.add(rplot.GeomPoint(colour=rplot.ScaleRandomColour( - 'day'), shape=rplot.ScaleShape('size'))) - self.fig = plt.gcf() - self.plot.render(self.fig) - - def test_rplot3(self): - import matplotlib.pyplot as plt - path = os.path.join(curpath(), 'data/tips.csv') - plt.figure() - self.data = read_csv(path, sep=',') - self.plot = rplot.RPlot(self.data, x='tip', y='total_bill') - self.plot.add(rplot.TrellisGrid(['sex', '.'])) - self.plot.add(rplot.GeomPoint(colour=rplot.ScaleRandomColour( - 'day'), shape=rplot.ScaleShape('size'))) - self.fig = plt.gcf() - self.plot.render(self.fig) - - def test_rplot_iris(self): - import matplotlib.pyplot as plt - path = os.path.join(curpath(), 'data/iris.csv') - plt.figure() - self.data = read_csv(path, sep=',') - plot = rplot.RPlot(self.data, x='SepalLength', y='SepalWidth') - plot.add(rplot.GeomPoint( - colour=rplot.ScaleGradient('PetalLength', - colour1=(0.0, 1.0, 0.5), - colour2=(1.0, 0.0, 0.5)), - size=rplot.ScaleSize('PetalWidth', min_size=10.0, - max_size=200.0), - shape=rplot.ScaleShape('Name'))) - self.fig = plt.gcf() - plot.render(self.fig) - - -if __name__ == '__main__': - import unittest - unittest.main() diff --git a/pandas/tests/test_stats.py b/pandas/tests/test_stats.py index 85ce1d5127512..41d25b9662b5b 100644 --- a/pandas/tests/test_stats.py +++ b/pandas/tests/test_stats.py @@ -179,6 +179,13 @@ def test_rank_int(self): expected.index = result.index assert_series_equal(result, expected) + def test_rank_object_bug(self): + # GH 13445 + + # smoke tests + Series([np.nan] * 32).astype(object).rank(ascending=True) + Series([np.nan] * 32).astype(object).rank(ascending=False) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 3d1851966afd0..4019bbe20ea1a 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -12,8 +12,7 @@ from pandas.compat import range, u import pandas.compat as compat -from pandas import (Index, Series, DataFrame, isnull, MultiIndex) -import pandas.core.common as com +from pandas import (Index, Series, DataFrame, isnull, MultiIndex, notnull) from pandas.util.testing import assert_series_equal import pandas.util.testing as tm @@ -430,6 +429,13 @@ def test_replace(self): result = values.str.replace("(?<=\w),(?=\w)", ", ", flags=re.UNICODE) tm.assert_series_equal(result, exp) + # GH 13438 + for klass in (Series, Index): + for repl in (None, 3, {'a': 'b'}): + for data in (['a', 'b', None], ['a', 
'b', 'c', 'ad']): + values = klass(data) + self.assertRaises(TypeError, values.str.replace, 'a', repl) + def test_repeat(self): values = Series(['a', 'b', NA, 'c', NA, 'd']) @@ -977,6 +983,20 @@ def test_extractall_single_group(self): e = DataFrame(['a', 'b', 'd', 'c'], i) tm.assert_frame_equal(r, e) + def test_extractall_single_group_with_quantifier(self): + # extractall(one un-named group with quantifier) returns + # DataFrame with one un-named column (GH13382). + s = Series(['ab3', 'abc3', 'd4cd2'], name='series_name') + r = s.str.extractall(r'([a-z]+)') + i = MultiIndex.from_tuples([ + (0, 0), + (1, 0), + (2, 0), + (2, 1), + ], names=(None, "match")) + e = DataFrame(['ab', 'abc', 'd', 'cd'], i) + tm.assert_frame_equal(r, e) + def test_extractall_no_matches(self): s = Series(['a3', 'b3', 'd4c2'], name='series_name') # one un-named group. @@ -1329,7 +1349,7 @@ def test_len(self): values = Series(['foo', 'fooo', 'fooooo', np.nan, 'fooooooo']) result = values.str.len() - exp = values.map(lambda x: len(x) if com.notnull(x) else NA) + exp = values.map(lambda x: len(x) if notnull(x) else NA) tm.assert_series_equal(result, exp) # mixed @@ -1347,7 +1367,7 @@ def test_len(self): 'fooooooo')]) result = values.str.len() - exp = values.map(lambda x: len(x) if com.notnull(x) else NA) + exp = values.map(lambda x: len(x) if notnull(x) else NA) tm.assert_series_equal(result, exp) def test_findall(self): @@ -1583,6 +1603,15 @@ def test_pad_fillchar(self): "fillchar must be a character, not int"): result = values.str.pad(5, fillchar=5) + def test_pad_width(self): + # GH 13598 + s = Series(['1', '22', 'a', 'bb']) + + for f in ['center', 'ljust', 'rjust', 'zfill', 'pad']: + with tm.assertRaisesRegexp(TypeError, + "width must be of integer type, not*"): + getattr(s.str, f)('f') + def test_translate(self): def _check(result, expected): @@ -1877,45 +1906,6 @@ def test_split_no_pat_with_nonzero_n(self): def test_split_to_dataframe(self): s = Series(['nosplit', 'alsonosplit']) - - with tm.assert_produces_warning(FutureWarning): - result = s.str.split('_', return_type='frame') - - exp = DataFrame({0: Series(['nosplit', 'alsonosplit'])}) - tm.assert_frame_equal(result, exp) - - s = Series(['some_equal_splits', 'with_no_nans']) - with tm.assert_produces_warning(FutureWarning): - result = s.str.split('_', return_type='frame') - exp = DataFrame({0: ['some', 'with'], - 1: ['equal', 'no'], - 2: ['splits', 'nans']}) - tm.assert_frame_equal(result, exp) - - s = Series(['some_unequal_splits', 'one_of_these_things_is_not']) - with tm.assert_produces_warning(FutureWarning): - result = s.str.split('_', return_type='frame') - exp = DataFrame({0: ['some', 'one'], - 1: ['unequal', 'of'], - 2: ['splits', 'these'], - 3: [NA, 'things'], - 4: [NA, 'is'], - 5: [NA, 'not']}) - tm.assert_frame_equal(result, exp) - - s = Series(['some_splits', 'with_index'], index=['preserve', 'me']) - with tm.assert_produces_warning(FutureWarning): - result = s.str.split('_', return_type='frame') - exp = DataFrame({0: ['some', 'with'], 1: ['splits', 'index']}, - index=['preserve', 'me']) - tm.assert_frame_equal(result, exp) - - with tm.assertRaisesRegexp(ValueError, "expand must be"): - with tm.assert_produces_warning(FutureWarning): - s.str.split('_', return_type="some_invalid_type") - - def test_split_to_dataframe_expand(self): - s = Series(['nosplit', 'alsonosplit']) result = s.str.split('_', expand=True) exp = DataFrame({0: Series(['nosplit', 'alsonosplit'])}) tm.assert_frame_equal(result, exp) @@ -1944,8 +1934,7 @@ def 
test_split_to_dataframe_expand(self): tm.assert_frame_equal(result, exp) with tm.assertRaisesRegexp(ValueError, "expand must be"): - with tm.assert_produces_warning(FutureWarning): - s.str.split('_', return_type="some_invalid_type") + s.str.split('_', expand="not_a_boolean") def test_split_to_multiindex_expand(self): idx = Index(['nosplit', 'alsonosplit']) @@ -1970,8 +1959,7 @@ def test_split_to_multiindex_expand(self): self.assertEqual(result.nlevels, 6) with tm.assertRaisesRegexp(ValueError, "expand must be"): - with tm.assert_produces_warning(FutureWarning): - idx.str.split('_', return_type="some_invalid_type") + idx.str.split('_', expand="not_a_boolean") def test_rsplit_to_dataframe_expand(self): s = Series(['nosplit', 'alsonosplit']) @@ -2451,6 +2439,26 @@ def test_more_contains(self): True, False, False]) assert_series_equal(result, expected) + def test_contains_nan(self): + # PR #14171 + s = Series([np.nan, np.nan, np.nan], dtype=np.object_) + + result = s.str.contains('foo', na=False) + expected = Series([False, False, False], dtype=np.bool_) + assert_series_equal(result, expected) + + result = s.str.contains('foo', na=True) + expected = Series([True, True, True], dtype=np.bool_) + assert_series_equal(result, expected) + + result = s.str.contains('foo', na="foo") + expected = Series(["foo", "foo", "foo"], dtype=np.object_) + assert_series_equal(result, expected) + + result = s.str.contains('foo') + expected = Series([np.nan, np.nan, np.nan], dtype=np.object_) + assert_series_equal(result, expected) + def test_more_replace(self): # PR #1179 s = Series(['A', 'B', 'C', 'Aaba', 'Baca', '', NA, 'CABA', diff --git a/pandas/tests/test_testing.py b/pandas/tests/test_testing.py index c4e864a909c03..c242213ee226f 100644 --- a/pandas/tests/test_testing.py +++ b/pandas/tests/test_testing.py @@ -315,6 +315,17 @@ def test_numpy_array_equal_object_message(self): with assertRaisesRegexp(AssertionError, expected): assert_almost_equal(a, b) + def test_numpy_array_equal_copy_flag(self): + a = np.array([1, 2, 3]) + b = a.copy() + c = a.view() + expected = 'array\(\[1, 2, 3\]\) is not array\(\[1, 2, 3\]\)' + with assertRaisesRegexp(AssertionError, expected): + assert_numpy_array_equal(a, b, check_same='same') + expected = 'array\(\[1, 2, 3\]\) is array\(\[1, 2, 3\]\)' + with assertRaisesRegexp(AssertionError, expected): + assert_numpy_array_equal(a, c, check_same='copy') + def test_assert_almost_equal_iterable_message(self): expected = """Iterable are different diff --git a/pandas/tests/test_util.py b/pandas/tests/test_util.py index d6baa720bac19..9193880df7feb 100644 --- a/pandas/tests/test_util.py +++ b/pandas/tests/test_util.py @@ -326,6 +326,16 @@ def test_exactly_one_ref(self): self.assertEqual(bytearray(as_stolen_buf), b'test') +def test_numpy_errstate_is_default(): + # The defaults since numpy 1.6.0 + expected = {'over': 'warn', 'divide': 'warn', 'invalid': 'warn', + 'under': 'ignore'} + import numpy as np + from pandas.compat import numpy # noqa + # The errstate should be unchanged after that import. 
+ tm.assert_equal(np.geterr(), expected) + + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 2ec419221c6d8..929ff43bfaaad 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -11,10 +11,10 @@ import pandas as pd from pandas import (Series, DataFrame, Panel, bdate_range, isnull, - notnull, concat) -import pandas.core.datetools as datetools + notnull, concat, Timestamp) import pandas.stats.moments as mom import pandas.core.window as rwindow +import pandas.tseries.offsets as offsets from pandas.core.base import SpecificationError from pandas.core.common import UnsupportedFunctionCall import pandas.util.testing as tm @@ -101,7 +101,7 @@ def tests_skip_nuisance(self): expected = pd.concat([r[['A', 'B']].sum(), df[['C']]], axis=1) result = r.sum() - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_like=True) def test_agg(self): df = DataFrame({'A': range(5), 'B': range(0, 10, 2)}) @@ -319,6 +319,13 @@ class TestRolling(Base): def setUp(self): self._create_data() + def test_doc_string(self): + + df = DataFrame({'B': [0, 1, 2, np.nan, 4]}) + df + df.rolling(2).sum() + df.rolling(2, min_periods=1).sum() + def test_constructor(self): # GH 12669 @@ -331,6 +338,11 @@ def test_constructor(self): c(window=2, min_periods=1, center=True) c(window=2, min_periods=1, center=False) + # GH 13383 + c(0) + with self.assertRaises(ValueError): + c(-1) + # not valid for w in [2., 'foo', np.array([2])]: with self.assertRaises(ValueError): @@ -340,6 +352,15 @@ def test_constructor(self): with self.assertRaises(ValueError): c(window=2, min_periods=1, center=w) + def test_constructor_with_win_type(self): + # GH 13383 + tm._skip_if_no_scipy() + for o in [self.series, self.frame]: + c = o.rolling + c(0, win_type='boxcar') + with self.assertRaises(ValueError): + c(-1, win_type='boxcar') + def test_numpy_compat(self): # see gh-12811 r = rwindow.Rolling(Series([2, 4, 6]), window=2) @@ -358,6 +379,12 @@ class TestExpanding(Base): def setUp(self): self._create_data() + def test_doc_string(self): + + df = DataFrame({'B': [0, 1, 2, np.nan, 4]}) + df + df.expanding(2).sum() + def test_constructor(self): # GH 12669 @@ -394,6 +421,12 @@ class TestEWM(Base): def setUp(self): self._create_data() + def test_doc_string(self): + + df = DataFrame({'B': [0, 1, 2, np.nan, 4]}) + df + df.ewm(com=0.5).mean() + def test_constructor(self): for o in [self.series, self.frame]: c = o.ewm @@ -551,6 +584,7 @@ def _create_data(self): def test_dtypes(self): self._create_data() for f_name, d_name in product(self.funcs.keys(), self.data.keys()): + f = self.funcs[f_name] d = self.data[d_name] exp = self.expects[d_name][f_name] @@ -944,6 +978,7 @@ def test_rolling_median(self): name='median') def test_rolling_min(self): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): self._check_moment_func(mom.rolling_min, np.min, name='min') @@ -956,6 +991,7 @@ def test_rolling_min(self): window=3, min_periods=5) def test_rolling_max(self): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): self._check_moment_func(mom.rolling_max, np.max, name='max') @@ -1285,7 +1321,7 @@ def get_result(obj, window, min_periods=None, freq=None, center=False): freq='B') last_date = series_result.index[-1] - prev_date = last_date - 24 * datetools.bday + prev_date = last_date - 24 * offsets.BDay() trunc_series = 
self.series[::2].truncate(prev_date, last_date) trunc_frame = self.frame[::2].truncate(prev_date, last_date) @@ -2876,6 +2912,7 @@ def test_rolling_median_memory_error(self): Series(np.random.randn(n)).rolling(window=2, center=False).median() def test_rolling_min_max_numeric_types(self): + # GH12373 types_test = [np.dtype("f{}".format(width)) for width in [4, 8]] types_test.extend([np.dtype("{}{}".format(sign, width)) @@ -2947,6 +2984,7 @@ def test_rolling(self): r = g.rolling(window=4) for f in ['sum', 'mean', 'min', 'max', 'count', 'kurt', 'skew']: + result = getattr(r, f)() expected = g.apply(lambda x: getattr(x.rolling(4), f)()) tm.assert_frame_equal(result, expected) @@ -2993,6 +3031,7 @@ def test_expanding(self): r = g.expanding() for f in ['sum', 'mean', 'min', 'max', 'count', 'kurt', 'skew']: + result = getattr(r, f)() expected = g.apply(lambda x: getattr(x.expanding(), f)()) tm.assert_frame_equal(result, expected) @@ -3033,3 +3072,547 @@ def test_expanding_apply(self): result = r.apply(lambda x: x.sum()) expected = g.apply(lambda x: x.expanding().apply(lambda y: y.sum())) tm.assert_frame_equal(result, expected) + + +class TestRollingTS(tm.TestCase): + + # rolling time-series friendly + # xref GH13327 + + def setUp(self): + + self.regular = DataFrame({'A': pd.date_range('20130101', + periods=5, + freq='s'), + 'B': range(5)}).set_index('A') + + self.ragged = DataFrame({'B': range(5)}) + self.ragged.index = [Timestamp('20130101 09:00:00'), + Timestamp('20130101 09:00:02'), + Timestamp('20130101 09:00:03'), + Timestamp('20130101 09:00:05'), + Timestamp('20130101 09:00:06')] + + def test_doc_string(self): + + df = DataFrame({'B': [0, 1, 2, np.nan, 4]}, + index=[Timestamp('20130101 09:00:00'), + Timestamp('20130101 09:00:02'), + Timestamp('20130101 09:00:03'), + Timestamp('20130101 09:00:05'), + Timestamp('20130101 09:00:06')]) + df + df.rolling('2s').sum() + + def test_valid(self): + + df = self.regular + + # not a valid freq + with self.assertRaises(ValueError): + df.rolling(window='foobar') + + # not a datetimelike index + with self.assertRaises(ValueError): + df.reset_index().rolling(window='foobar') + + # non-fixed freqs + for freq in ['2MS', pd.offsets.MonthBegin(2)]: + with self.assertRaises(ValueError): + df.rolling(window=freq) + + for freq in ['1D', pd.offsets.Day(2), '2ms']: + df.rolling(window=freq) + + # non-integer min_periods + for minp in [1.0, 'foo', np.array([1, 2, 3])]: + with self.assertRaises(ValueError): + df.rolling(window='1D', min_periods=minp) + + # center is not implemented + with self.assertRaises(NotImplementedError): + df.rolling(window='1D', center=True) + + def test_on(self): + + df = self.regular + + # not a valid column + with self.assertRaises(ValueError): + df.rolling(window='2s', on='foobar') + + # column is valid + df = df.copy() + df['C'] = pd.date_range('20130101', periods=len(df)) + df.rolling(window='2d', on='C').sum() + + # invalid columns + with self.assertRaises(ValueError): + df.rolling(window='2d', on='B') + + # ok even though on non-selected + df.rolling(window='2d', on='C').B.sum() + + def test_monotonic_on(self): + + # on/index must be monotonic + df = DataFrame({'A': pd.date_range('20130101', + periods=5, + freq='s'), + 'B': range(5)}) + + self.assertTrue(df.A.is_monotonic) + df.rolling('2s', on='A').sum() + + df = df.set_index('A') + self.assertTrue(df.index.is_monotonic) + df.rolling('2s').sum() + + # non-monotonic + df.index = reversed(df.index.tolist()) + self.assertFalse(df.index.is_monotonic) + + with 
self.assertRaises(ValueError): + df.rolling('2s').sum() + + df = df.reset_index() + with self.assertRaises(ValueError): + df.rolling('2s', on='A').sum() + + def test_frame_on(self): + + df = DataFrame({'B': range(5), + 'C': pd.date_range('20130101 09:00:00', + periods=5, + freq='3s')}) + + df['A'] = [Timestamp('20130101 09:00:00'), + Timestamp('20130101 09:00:02'), + Timestamp('20130101 09:00:03'), + Timestamp('20130101 09:00:05'), + Timestamp('20130101 09:00:06')] + + # we are doing simulating using 'on' + expected = (df.set_index('A') + .rolling('2s') + .B + .sum() + .reset_index(drop=True) + ) + + result = (df.rolling('2s', on='A') + .B + .sum() + ) + tm.assert_series_equal(result, expected) + + # test as a frame + # we should be ignoring the 'on' as an aggregation column + # note that the expected is setting, computing, and reseting + # so the columns need to be switched compared + # to the actual result where they are ordered as in the + # original + expected = (df.set_index('A') + .rolling('2s')[['B']] + .sum() + .reset_index()[['B', 'A']] + ) + + result = (df.rolling('2s', on='A')[['B']] + .sum() + ) + tm.assert_frame_equal(result, expected) + + def test_frame_on2(self): + + # using multiple aggregation columns + df = DataFrame({'A': [0, 1, 2, 3, 4], + 'B': [0, 1, 2, np.nan, 4], + 'C': pd.Index([pd.Timestamp('20130101 09:00:00'), + pd.Timestamp('20130101 09:00:02'), + pd.Timestamp('20130101 09:00:03'), + pd.Timestamp('20130101 09:00:05'), + pd.Timestamp('20130101 09:00:06')])}, + columns=['A', 'C', 'B']) + + expected1 = DataFrame({'A': [0., 1, 3, 3, 7], + 'B': [0, 1, 3, np.nan, 4], + 'C': df['C']}, + columns=['A', 'C', 'B']) + + result = df.rolling('2s', on='C').sum() + expected = expected1 + tm.assert_frame_equal(result, expected) + + expected = Series([0, 1, 3, np.nan, 4], name='B') + result = df.rolling('2s', on='C').B.sum() + tm.assert_series_equal(result, expected) + + expected = expected1[['A', 'B', 'C']] + result = df.rolling('2s', on='C')[['A', 'B', 'C']].sum() + tm.assert_frame_equal(result, expected) + + def test_basic_regular(self): + + df = self.regular.copy() + + df.index = pd.date_range('20130101', periods=5, freq='D') + expected = df.rolling(window=1, min_periods=1).sum() + result = df.rolling(window='1D').sum() + tm.assert_frame_equal(result, expected) + + df.index = pd.date_range('20130101', periods=5, freq='2D') + expected = df.rolling(window=1, min_periods=1).sum() + result = df.rolling(window='2D', min_periods=1).sum() + tm.assert_frame_equal(result, expected) + + expected = df.rolling(window=1, min_periods=1).sum() + result = df.rolling(window='2D', min_periods=1).sum() + tm.assert_frame_equal(result, expected) + + expected = df.rolling(window=1).sum() + result = df.rolling(window='2D').sum() + tm.assert_frame_equal(result, expected) + + def test_min_periods(self): + + # compare for min_periods + df = self.regular + + # these slightly different + expected = df.rolling(2, min_periods=1).sum() + result = df.rolling('2s').sum() + tm.assert_frame_equal(result, expected) + + expected = df.rolling(2, min_periods=1).sum() + result = df.rolling('2s', min_periods=1).sum() + tm.assert_frame_equal(result, expected) + + def test_ragged_sum(self): + + df = self.ragged + result = df.rolling(window='1s', min_periods=1).sum() + expected = df.copy() + expected['B'] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='2s', min_periods=1).sum() + expected = df.copy() + expected['B'] = [0.0, 1, 3, 3, 7] + tm.assert_frame_equal(result, 
expected) + + result = df.rolling(window='2s', min_periods=2).sum() + expected = df.copy() + expected['B'] = [np.nan, np.nan, 3, np.nan, 7] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='3s', min_periods=1).sum() + expected = df.copy() + expected['B'] = [0.0, 1, 3, 5, 7] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='3s').sum() + expected = df.copy() + expected['B'] = [0.0, 1, 3, 5, 7] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='4s', min_periods=1).sum() + expected = df.copy() + expected['B'] = [0.0, 1, 3, 6, 9] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='4s', min_periods=3).sum() + expected = df.copy() + expected['B'] = [np.nan, np.nan, 3, 6, 9] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='5s', min_periods=1).sum() + expected = df.copy() + expected['B'] = [0.0, 1, 3, 6, 10] + tm.assert_frame_equal(result, expected) + + def test_ragged_mean(self): + + df = self.ragged + result = df.rolling(window='1s', min_periods=1).mean() + expected = df.copy() + expected['B'] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='2s', min_periods=1).mean() + expected = df.copy() + expected['B'] = [0.0, 1, 1.5, 3.0, 3.5] + tm.assert_frame_equal(result, expected) + + def test_ragged_median(self): + + df = self.ragged + result = df.rolling(window='1s', min_periods=1).median() + expected = df.copy() + expected['B'] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='2s', min_periods=1).median() + expected = df.copy() + expected['B'] = [0.0, 1, 1.5, 3.0, 3.5] + tm.assert_frame_equal(result, expected) + + def test_ragged_quantile(self): + + df = self.ragged + result = df.rolling(window='1s', min_periods=1).quantile(0.5) + expected = df.copy() + expected['B'] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='2s', min_periods=1).quantile(0.5) + expected = df.copy() + expected['B'] = [0.0, 1, 1.0, 3.0, 3.0] + tm.assert_frame_equal(result, expected) + + def test_ragged_std(self): + + df = self.ragged + result = df.rolling(window='1s', min_periods=1).std(ddof=0) + expected = df.copy() + expected['B'] = [0.0] * 5 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='1s', min_periods=1).std(ddof=1) + expected = df.copy() + expected['B'] = [np.nan] * 5 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='3s', min_periods=1).std(ddof=0) + expected = df.copy() + expected['B'] = [0.0] + [0.5] * 4 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='5s', min_periods=1).std(ddof=1) + expected = df.copy() + expected['B'] = [np.nan, 0.707107, 1.0, 1.0, 1.290994] + tm.assert_frame_equal(result, expected) + + def test_ragged_var(self): + + df = self.ragged + result = df.rolling(window='1s', min_periods=1).var(ddof=0) + expected = df.copy() + expected['B'] = [0.0] * 5 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='1s', min_periods=1).var(ddof=1) + expected = df.copy() + expected['B'] = [np.nan] * 5 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='3s', min_periods=1).var(ddof=0) + expected = df.copy() + expected['B'] = [0.0] + [0.25] * 4 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='5s', min_periods=1).var(ddof=1) + expected = df.copy() + expected['B'] = [np.nan, 0.5, 1.0, 1.0, 1 + 2 / 3.] 
+ tm.assert_frame_equal(result, expected) + + def test_ragged_skew(self): + + df = self.ragged + result = df.rolling(window='3s', min_periods=1).skew() + expected = df.copy() + expected['B'] = [np.nan] * 5 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='5s', min_periods=1).skew() + expected = df.copy() + expected['B'] = [np.nan] * 2 + [0.0, 0.0, 0.0] + tm.assert_frame_equal(result, expected) + + def test_ragged_kurt(self): + + df = self.ragged + result = df.rolling(window='3s', min_periods=1).kurt() + expected = df.copy() + expected['B'] = [np.nan] * 5 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='5s', min_periods=1).kurt() + expected = df.copy() + expected['B'] = [np.nan] * 4 + [-1.2] + tm.assert_frame_equal(result, expected) + + def test_ragged_count(self): + + df = self.ragged + result = df.rolling(window='1s', min_periods=1).count() + expected = df.copy() + expected['B'] = [1.0, 1, 1, 1, 1] + tm.assert_frame_equal(result, expected) + + df = self.ragged + result = df.rolling(window='1s').count() + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='2s', min_periods=1).count() + expected = df.copy() + expected['B'] = [1.0, 1, 2, 1, 2] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='2s', min_periods=2).count() + expected = df.copy() + expected['B'] = [np.nan, np.nan, 2, np.nan, 2] + tm.assert_frame_equal(result, expected) + + def test_regular_min(self): + + df = DataFrame({'A': pd.date_range('20130101', + periods=5, + freq='s'), + 'B': [0.0, 1, 2, 3, 4]}).set_index('A') + result = df.rolling('1s').min() + expected = df.copy() + expected['B'] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + df = DataFrame({'A': pd.date_range('20130101', + periods=5, + freq='s'), + 'B': [5, 4, 3, 4, 5]}).set_index('A') + + tm.assert_frame_equal(result, expected) + result = df.rolling('2s').min() + expected = df.copy() + expected['B'] = [5.0, 4, 3, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling('5s').min() + expected = df.copy() + expected['B'] = [5.0, 4, 3, 3, 3] + tm.assert_frame_equal(result, expected) + + def test_ragged_min(self): + + df = self.ragged + + result = df.rolling(window='1s', min_periods=1).min() + expected = df.copy() + expected['B'] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='2s', min_periods=1).min() + expected = df.copy() + expected['B'] = [0.0, 1, 1, 3, 3] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='5s', min_periods=1).min() + expected = df.copy() + expected['B'] = [0.0, 0, 0, 1, 1] + tm.assert_frame_equal(result, expected) + + def test_perf_min(self): + + N = 10000 + + dfp = DataFrame({'B': np.random.randn(N)}, + index=pd.date_range('20130101', + periods=N, + freq='s')) + expected = dfp.rolling(2, min_periods=1).min() + result = dfp.rolling('2s').min() + self.assertTrue(((result - expected) < 0.01).all().bool()) + + expected = dfp.rolling(200, min_periods=1).min() + result = dfp.rolling('200s').min() + self.assertTrue(((result - expected) < 0.01).all().bool()) + + def test_ragged_max(self): + + df = self.ragged + + result = df.rolling(window='1s', min_periods=1).max() + expected = df.copy() + expected['B'] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='2s', min_periods=1).max() + expected = df.copy() + expected['B'] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='5s', 
min_periods=1).max() + expected = df.copy() + expected['B'] = [0.0, 1, 2, 3, 4] + tm.assert_frame_equal(result, expected) + + def test_ragged_apply(self): + + df = self.ragged + + f = lambda x: 1 + result = df.rolling(window='1s', min_periods=1).apply(f) + expected = df.copy() + expected['B'] = 1. + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='2s', min_periods=1).apply(f) + expected = df.copy() + expected['B'] = 1. + tm.assert_frame_equal(result, expected) + + result = df.rolling(window='5s', min_periods=1).apply(f) + expected = df.copy() + expected['B'] = 1. + tm.assert_frame_equal(result, expected) + + def test_all(self): + + # simple comparision of integer vs time-based windowing + df = self.regular * 2 + er = df.rolling(window=1) + r = df.rolling(window='1s') + + for f in ['sum', 'mean', 'count', 'median', 'std', + 'var', 'kurt', 'skew', 'min', 'max']: + + result = getattr(r, f)() + expected = getattr(er, f)() + tm.assert_frame_equal(result, expected) + + result = r.quantile(0.5) + expected = er.quantile(0.5) + tm.assert_frame_equal(result, expected) + + result = r.apply(lambda x: 1) + expected = er.apply(lambda x: 1) + tm.assert_frame_equal(result, expected) + + def test_all2(self): + + # more sophisticated comparision of integer vs. + # time-based windowing + df = DataFrame({'B': np.arange(50)}, + index=pd.date_range('20130101', + periods=50, freq='H') + ) + # in-range data + dft = df.between_time("09:00", "16:00") + + r = dft.rolling(window='5H') + + for f in ['sum', 'mean', 'count', 'median', 'std', + 'var', 'kurt', 'skew', 'min', 'max']: + + result = getattr(r, f)() + + # we need to roll the days separately + # to compare with a time-based roll + # finally groupby-apply will return a multi-index + # so we need to drop the day + def agg_by_day(x): + x = x.between_time("09:00", "16:00") + return getattr(x.rolling(5, min_periods=1), f)() + expected = df.groupby(df.index.day).apply( + agg_by_day).reset_index(level=0, drop=True) + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/types/test_cast.py b/pandas/tests/types/test_cast.py new file mode 100644 index 0000000000000..56a14a51105ca --- /dev/null +++ b/pandas/tests/types/test_cast.py @@ -0,0 +1,285 @@ +# -*- coding: utf-8 -*- + +""" +These test the private routines in types/cast.py + +""" + + +import nose +from datetime import datetime +import numpy as np + +from pandas import Timedelta, Timestamp +from pandas.types.cast import (_possibly_downcast_to_dtype, + _possibly_convert_objects, + _infer_dtype_from_scalar, + _maybe_convert_string_to_object, + _maybe_convert_scalar, + _find_common_type) +from pandas.types.dtypes import (CategoricalDtype, + DatetimeTZDtype, PeriodDtype) +from pandas.util import testing as tm + +_multiprocess_can_split_ = True + + +class TestPossiblyDowncast(tm.TestCase): + + def test_downcast_conv(self): + # test downcasting + + arr = np.array([8.5, 8.6, 8.7, 8.8, 8.9999999999995]) + result = _possibly_downcast_to_dtype(arr, 'infer') + assert (np.array_equal(result, arr)) + + arr = np.array([8., 8., 8., 8., 8.9999999999995]) + result = _possibly_downcast_to_dtype(arr, 'infer') + expected = np.array([8, 8, 8, 8, 9]) + assert (np.array_equal(result, expected)) + + arr = np.array([8., 8., 8., 8., 9.0000000000005]) + result = _possibly_downcast_to_dtype(arr, 'infer') + expected = np.array([8, 8, 8, 8, 9]) + assert (np.array_equal(result, expected)) + + # conversions + + expected = np.array([1, 2]) + for dtype in [np.float64, object, np.int64]: + arr = np.array([1.0, 
2.0], dtype=dtype) + result = _possibly_downcast_to_dtype(arr, 'infer') + tm.assert_almost_equal(result, expected, check_dtype=False) + + for dtype in [np.float64, object]: + expected = np.array([1.0, 2.0, np.nan], dtype=dtype) + arr = np.array([1.0, 2.0, np.nan], dtype=dtype) + result = _possibly_downcast_to_dtype(arr, 'infer') + tm.assert_almost_equal(result, expected) + + # empties + for dtype in [np.int32, np.float64, np.float32, np.bool_, + np.int64, object]: + arr = np.array([], dtype=dtype) + result = _possibly_downcast_to_dtype(arr, 'int64') + tm.assert_almost_equal(result, np.array([], dtype=np.int64)) + assert result.dtype == np.int64 + + def test_datetimelikes_nan(self): + arr = np.array([1, 2, np.nan]) + exp = np.array([1, 2, np.datetime64('NaT')], dtype='datetime64[ns]') + res = _possibly_downcast_to_dtype(arr, 'datetime64[ns]') + tm.assert_numpy_array_equal(res, exp) + + exp = np.array([1, 2, np.timedelta64('NaT')], dtype='timedelta64[ns]') + res = _possibly_downcast_to_dtype(arr, 'timedelta64[ns]') + tm.assert_numpy_array_equal(res, exp) + + +class TestInferDtype(tm.TestCase): + + def test_infer_dtype_from_scalar(self): + # Test that _infer_dtype_from_scalar is returning correct dtype for int + # and float. + + for dtypec in [np.uint8, np.int8, np.uint16, np.int16, np.uint32, + np.int32, np.uint64, np.int64]: + data = dtypec(12) + dtype, val = _infer_dtype_from_scalar(data) + self.assertEqual(dtype, type(data)) + + data = 12 + dtype, val = _infer_dtype_from_scalar(data) + self.assertEqual(dtype, np.int64) + + for dtypec in [np.float16, np.float32, np.float64]: + data = dtypec(12) + dtype, val = _infer_dtype_from_scalar(data) + self.assertEqual(dtype, dtypec) + + data = np.float(12) + dtype, val = _infer_dtype_from_scalar(data) + self.assertEqual(dtype, np.float64) + + for data in [True, False]: + dtype, val = _infer_dtype_from_scalar(data) + self.assertEqual(dtype, np.bool_) + + for data in [np.complex64(1), np.complex128(1)]: + dtype, val = _infer_dtype_from_scalar(data) + self.assertEqual(dtype, np.complex_) + + import datetime + for data in [np.datetime64(1, 'ns'), Timestamp(1), + datetime.datetime(2000, 1, 1, 0, 0)]: + dtype, val = _infer_dtype_from_scalar(data) + self.assertEqual(dtype, 'M8[ns]') + + for data in [np.timedelta64(1, 'ns'), Timedelta(1), + datetime.timedelta(1)]: + dtype, val = _infer_dtype_from_scalar(data) + self.assertEqual(dtype, 'm8[ns]') + + for data in [datetime.date(2000, 1, 1), + Timestamp(1, tz='US/Eastern'), 'foo']: + dtype, val = _infer_dtype_from_scalar(data) + self.assertEqual(dtype, np.object_) + + +class TestMaybe(tm.TestCase): + + def test_maybe_convert_string_to_array(self): + result = _maybe_convert_string_to_object('x') + tm.assert_numpy_array_equal(result, np.array(['x'], dtype=object)) + self.assertTrue(result.dtype == object) + + result = _maybe_convert_string_to_object(1) + self.assertEqual(result, 1) + + arr = np.array(['x', 'y'], dtype=str) + result = _maybe_convert_string_to_object(arr) + tm.assert_numpy_array_equal(result, np.array(['x', 'y'], dtype=object)) + self.assertTrue(result.dtype == object) + + # unicode + arr = np.array(['x', 'y']).astype('U') + result = _maybe_convert_string_to_object(arr) + tm.assert_numpy_array_equal(result, np.array(['x', 'y'], dtype=object)) + self.assertTrue(result.dtype == object) + + # object + arr = np.array(['x', 2], dtype=object) + result = _maybe_convert_string_to_object(arr) + tm.assert_numpy_array_equal(result, np.array(['x', 2], dtype=object)) + self.assertTrue(result.dtype == object) + 
+ def test_maybe_convert_scalar(self): + + # pass thru + result = _maybe_convert_scalar('x') + self.assertEqual(result, 'x') + result = _maybe_convert_scalar(np.array([1])) + self.assertEqual(result, np.array([1])) + + # leave scalar dtype + result = _maybe_convert_scalar(np.int64(1)) + self.assertEqual(result, np.int64(1)) + result = _maybe_convert_scalar(np.int32(1)) + self.assertEqual(result, np.int32(1)) + result = _maybe_convert_scalar(np.float32(1)) + self.assertEqual(result, np.float32(1)) + result = _maybe_convert_scalar(np.int64(1)) + self.assertEqual(result, np.float64(1)) + + # coerce + result = _maybe_convert_scalar(1) + self.assertEqual(result, np.int64(1)) + result = _maybe_convert_scalar(1.0) + self.assertEqual(result, np.float64(1)) + result = _maybe_convert_scalar(Timestamp('20130101')) + self.assertEqual(result, Timestamp('20130101').value) + result = _maybe_convert_scalar(datetime(2013, 1, 1)) + self.assertEqual(result, Timestamp('20130101').value) + result = _maybe_convert_scalar(Timedelta('1 day 1 min')) + self.assertEqual(result, Timedelta('1 day 1 min').value) + + +class TestConvert(tm.TestCase): + + def test_possibly_convert_objects_copy(self): + values = np.array([1, 2]) + + out = _possibly_convert_objects(values, copy=False) + self.assertTrue(values is out) + + out = _possibly_convert_objects(values, copy=True) + self.assertTrue(values is not out) + + values = np.array(['apply', 'banana']) + out = _possibly_convert_objects(values, copy=False) + self.assertTrue(values is out) + + out = _possibly_convert_objects(values, copy=True) + self.assertTrue(values is not out) + + +class TestCommonTypes(tm.TestCase): + + def test_numpy_dtypes(self): + # (source_types, destination_type) + testcases = ( + # identity + ((np.int64,), np.int64), + ((np.uint64,), np.uint64), + ((np.float32,), np.float32), + ((np.object,), np.object), + + # into ints + ((np.int16, np.int64), np.int64), + ((np.int32, np.uint32), np.int64), + ((np.uint16, np.uint64), np.uint64), + + # into floats + ((np.float16, np.float32), np.float32), + ((np.float16, np.int16), np.float32), + ((np.float32, np.int16), np.float32), + ((np.uint64, np.int64), np.float64), + ((np.int16, np.float64), np.float64), + ((np.float16, np.int64), np.float64), + + # into others + ((np.complex128, np.int32), np.complex128), + ((np.object, np.float32), np.object), + ((np.object, np.int16), np.object), + + ((np.dtype('datetime64[ns]'), np.dtype('datetime64[ns]')), + np.dtype('datetime64[ns]')), + ((np.dtype('timedelta64[ns]'), np.dtype('timedelta64[ns]')), + np.dtype('timedelta64[ns]')), + + ((np.dtype('datetime64[ns]'), np.dtype('datetime64[ms]')), + np.dtype('datetime64[ns]')), + ((np.dtype('timedelta64[ms]'), np.dtype('timedelta64[ns]')), + np.dtype('timedelta64[ns]')), + + ((np.dtype('datetime64[ns]'), np.dtype('timedelta64[ns]')), + np.object), + ((np.dtype('datetime64[ns]'), np.int64), np.object) + ) + for src, common in testcases: + self.assertEqual(_find_common_type(src), common) + + with tm.assertRaises(ValueError): + # empty + _find_common_type([]) + + def test_categorical_dtype(self): + dtype = CategoricalDtype() + self.assertEqual(_find_common_type([dtype]), 'category') + self.assertEqual(_find_common_type([dtype, dtype]), 'category') + self.assertEqual(_find_common_type([np.object, dtype]), np.object) + + def test_datetimetz_dtype(self): + dtype = DatetimeTZDtype(unit='ns', tz='US/Eastern') + self.assertEqual(_find_common_type([dtype, dtype]), + 'datetime64[ns, US/Eastern]') + + for dtype2 in 
[DatetimeTZDtype(unit='ns', tz='Asia/Tokyo'), + np.dtype('datetime64[ns]'), np.object, np.int64]: + self.assertEqual(_find_common_type([dtype, dtype2]), np.object) + self.assertEqual(_find_common_type([dtype2, dtype]), np.object) + + def test_period_dtype(self): + dtype = PeriodDtype(freq='D') + self.assertEqual(_find_common_type([dtype, dtype]), 'period[D]') + + for dtype2 in [DatetimeTZDtype(unit='ns', tz='Asia/Tokyo'), + PeriodDtype(freq='2D'), PeriodDtype(freq='H'), + np.dtype('datetime64[ns]'), np.object, np.int64]: + self.assertEqual(_find_common_type([dtype, dtype2]), np.object) + self.assertEqual(_find_common_type([dtype2, dtype]), np.object) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/types/test_common.py b/pandas/tests/types/test_common.py new file mode 100644 index 0000000000000..4d6f50862c562 --- /dev/null +++ b/pandas/tests/types/test_common.py @@ -0,0 +1,62 @@ +# -*- coding: utf-8 -*- + +import nose +import numpy as np + +from pandas.types.dtypes import DatetimeTZDtype, PeriodDtype, CategoricalDtype +from pandas.types.common import pandas_dtype, is_dtype_equal + +import pandas.util.testing as tm + +_multiprocess_can_split_ = True + + +class TestPandasDtype(tm.TestCase): + + def test_numpy_dtype(self): + for dtype in ['M8[ns]', 'm8[ns]', 'object', 'float64', 'int64']: + self.assertEqual(pandas_dtype(dtype), np.dtype(dtype)) + + def test_numpy_string_dtype(self): + # do not parse freq-like string as period dtype + self.assertEqual(pandas_dtype('U'), np.dtype('U')) + self.assertEqual(pandas_dtype('S'), np.dtype('S')) + + def test_datetimetz_dtype(self): + for dtype in ['datetime64[ns, US/Eastern]', + 'datetime64[ns, Asia/Tokyo]', + 'datetime64[ns, UTC]']: + self.assertIs(pandas_dtype(dtype), DatetimeTZDtype(dtype)) + self.assertEqual(pandas_dtype(dtype), DatetimeTZDtype(dtype)) + self.assertEqual(pandas_dtype(dtype), dtype) + + def test_categorical_dtype(self): + self.assertEqual(pandas_dtype('category'), CategoricalDtype()) + + def test_period_dtype(self): + for dtype in ['period[D]', 'period[3M]', 'period[U]', + 'Period[D]', 'Period[3M]', 'Period[U]']: + self.assertIs(pandas_dtype(dtype), PeriodDtype(dtype)) + self.assertEqual(pandas_dtype(dtype), PeriodDtype(dtype)) + self.assertEqual(pandas_dtype(dtype), dtype) + + +def test_dtype_equal(): + assert is_dtype_equal(np.int64, np.int64) + assert not is_dtype_equal(np.int64, np.float64) + + p1 = PeriodDtype('D') + p2 = PeriodDtype('D') + assert is_dtype_equal(p1, p2) + assert not is_dtype_equal(np.int64, p1) + + p3 = PeriodDtype('2D') + assert not is_dtype_equal(p1, p3) + + assert not DatetimeTZDtype.is_dtype(np.int64) + assert not PeriodDtype.is_dtype(np.int64) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/types/test_concat.py b/pandas/tests/types/test_concat.py new file mode 100644 index 0000000000000..6403dcb5a5350 --- /dev/null +++ b/pandas/tests/types/test_concat.py @@ -0,0 +1,86 @@ +# -*- coding: utf-8 -*- + +import nose +import pandas as pd +import pandas.types.concat as _concat +import pandas.util.testing as tm + + +class TestConcatCompat(tm.TestCase): + + _multiprocess_can_split_ = True + + def check_concat(self, to_concat, exp): + for klass in [pd.Index, pd.Series]: + to_concat_klass = [klass(c) for c in to_concat] + res = _concat.get_dtype_kinds(to_concat_klass) + self.assertEqual(res, set(exp)) + + def 
test_get_dtype_kinds(self): + to_concat = [['a'], [1, 2]] + self.check_concat(to_concat, ['i', 'object']) + + to_concat = [[3, 4], [1, 2]] + self.check_concat(to_concat, ['i']) + + to_concat = [[3, 4], [1, 2.1]] + self.check_concat(to_concat, ['i', 'f']) + + def test_get_dtype_kinds_datetimelike(self): + to_concat = [pd.DatetimeIndex(['2011-01-01']), + pd.DatetimeIndex(['2011-01-02'])] + self.check_concat(to_concat, ['datetime']) + + to_concat = [pd.TimedeltaIndex(['1 days']), + pd.TimedeltaIndex(['2 days'])] + self.check_concat(to_concat, ['timedelta']) + + def test_get_dtype_kinds_datetimelike_object(self): + to_concat = [pd.DatetimeIndex(['2011-01-01']), + pd.DatetimeIndex(['2011-01-02'], tz='US/Eastern')] + self.check_concat(to_concat, + ['datetime', 'datetime64[ns, US/Eastern]']) + + to_concat = [pd.DatetimeIndex(['2011-01-01'], tz='Asia/Tokyo'), + pd.DatetimeIndex(['2011-01-02'], tz='US/Eastern')] + self.check_concat(to_concat, + ['datetime64[ns, Asia/Tokyo]', + 'datetime64[ns, US/Eastern]']) + + # timedelta has single type + to_concat = [pd.TimedeltaIndex(['1 days']), + pd.TimedeltaIndex(['2 hours'])] + self.check_concat(to_concat, ['timedelta']) + + to_concat = [pd.DatetimeIndex(['2011-01-01'], tz='Asia/Tokyo'), + pd.TimedeltaIndex(['1 days'])] + self.check_concat(to_concat, + ['datetime64[ns, Asia/Tokyo]', 'timedelta']) + + def test_get_dtype_kinds_period(self): + # because we don't have Period dtype (yet), + # Series results in object dtype + to_concat = [pd.PeriodIndex(['2011-01'], freq='M'), + pd.PeriodIndex(['2011-01'], freq='M')] + res = _concat.get_dtype_kinds(to_concat) + self.assertEqual(res, set(['period[M]'])) + + to_concat = [pd.Series([pd.Period('2011-01', freq='M')]), + pd.Series([pd.Period('2011-02', freq='M')])] + res = _concat.get_dtype_kinds(to_concat) + self.assertEqual(res, set(['object'])) + + to_concat = [pd.PeriodIndex(['2011-01'], freq='M'), + pd.PeriodIndex(['2011-01'], freq='D')] + res = _concat.get_dtype_kinds(to_concat) + self.assertEqual(res, set(['period[M]', 'period[D]'])) + + to_concat = [pd.Series([pd.Period('2011-01', freq='M')]), + pd.Series([pd.Period('2011-02', freq='D')])] + res = _concat.get_dtype_kinds(to_concat) + self.assertEqual(res, set(['object'])) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/types/test_dtypes.py b/pandas/tests/types/test_dtypes.py index d48b9baf64777..a2b0a9ebfa6cc 100644 --- a/pandas/tests/types/test_dtypes.py +++ b/pandas/tests/types/test_dtypes.py @@ -3,14 +3,16 @@ import nose import numpy as np +import pandas as pd from pandas import Series, Categorical, date_range -import pandas.core.common as com -from pandas.types.api import CategoricalDtype -from pandas.core.common import (is_categorical_dtype, - is_categorical, DatetimeTZDtype, - is_datetime64tz_dtype, is_datetimetz, - is_dtype_equal, is_datetime64_ns_dtype, - is_datetime64_dtype) + +from pandas.types.dtypes import DatetimeTZDtype, PeriodDtype, CategoricalDtype +from pandas.types.common import (is_categorical_dtype, is_categorical, + is_datetime64tz_dtype, is_datetimetz, + is_period_dtype, is_period, + is_dtype_equal, is_datetime64_ns_dtype, + is_datetime64_dtype, is_string_dtype, + _coerce_to_dtype) import pandas.util.testing as tm _multiprocess_can_split_ = True @@ -23,6 +25,7 @@ def test_hash(self): def test_equality_invalid(self): self.assertRaises(self.dtype == 'foo') + self.assertFalse(is_dtype_equal(self.dtype, np.int64)) def test_numpy_informed(self): @@ 
-77,8 +80,7 @@ def test_basic(self): self.assertTrue(is_categorical_dtype(self.dtype)) - factor = Categorical.from_array(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c' - ]) + factor = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']) s = Series(factor, name='A') @@ -124,9 +126,9 @@ def test_subclass(self): self.assertTrue(issubclass(type(a), type(b))) def test_coerce_to_dtype(self): - self.assertEqual(com._coerce_to_dtype('datetime64[ns, US/Eastern]'), + self.assertEqual(_coerce_to_dtype('datetime64[ns, US/Eastern]'), DatetimeTZDtype('ns', 'US/Eastern')) - self.assertEqual(com._coerce_to_dtype('datetime64[ns, Asia/Tokyo]'), + self.assertEqual(_coerce_to_dtype('datetime64[ns, Asia/Tokyo]'), DatetimeTZDtype('ns', 'Asia/Tokyo')) def test_compat(self): @@ -205,6 +207,148 @@ def test_parser(self): DatetimeTZDtype('ns', tz), ) + def test_empty(self): + dt = DatetimeTZDtype() + with tm.assertRaises(AttributeError): + str(dt) + + +class TestPeriodDtype(Base, tm.TestCase): + + def setUp(self): + self.dtype = PeriodDtype('D') + + def test_construction(self): + with tm.assertRaises(ValueError): + PeriodDtype('xx') + + for s in ['period[D]', 'Period[D]', 'D']: + dt = PeriodDtype(s) + self.assertEqual(dt.freq, pd.tseries.offsets.Day()) + self.assertTrue(is_period_dtype(dt)) + + for s in ['period[3D]', 'Period[3D]', '3D']: + dt = PeriodDtype(s) + self.assertEqual(dt.freq, pd.tseries.offsets.Day(3)) + self.assertTrue(is_period_dtype(dt)) + + for s in ['period[26H]', 'Period[26H]', '26H', + 'period[1D2H]', 'Period[1D2H]', '1D2H']: + dt = PeriodDtype(s) + self.assertEqual(dt.freq, pd.tseries.offsets.Hour(26)) + self.assertTrue(is_period_dtype(dt)) + + def test_subclass(self): + a = PeriodDtype('period[D]') + b = PeriodDtype('period[3D]') + + self.assertTrue(issubclass(type(a), type(a))) + self.assertTrue(issubclass(type(a), type(b))) + + def test_identity(self): + self.assertEqual(PeriodDtype('period[D]'), + PeriodDtype('period[D]')) + self.assertIs(PeriodDtype('period[D]'), + PeriodDtype('period[D]')) + + self.assertEqual(PeriodDtype('period[3D]'), + PeriodDtype('period[3D]')) + self.assertIs(PeriodDtype('period[3D]'), + PeriodDtype('period[3D]')) + + self.assertEqual(PeriodDtype('period[1S1U]'), + PeriodDtype('period[1000001U]')) + self.assertIs(PeriodDtype('period[1S1U]'), + PeriodDtype('period[1000001U]')) + + def test_coerce_to_dtype(self): + self.assertEqual(_coerce_to_dtype('period[D]'), + PeriodDtype('period[D]')) + self.assertEqual(_coerce_to_dtype('period[3M]'), + PeriodDtype('period[3M]')) + + def test_compat(self): + self.assertFalse(is_datetime64_ns_dtype(self.dtype)) + self.assertFalse(is_datetime64_ns_dtype('period[D]')) + self.assertFalse(is_datetime64_dtype(self.dtype)) + self.assertFalse(is_datetime64_dtype('period[D]')) + + def test_construction_from_string(self): + result = PeriodDtype('period[D]') + self.assertTrue(is_dtype_equal(self.dtype, result)) + result = PeriodDtype.construct_from_string('period[D]') + self.assertTrue(is_dtype_equal(self.dtype, result)) + with tm.assertRaises(TypeError): + PeriodDtype.construct_from_string('foo') + with tm.assertRaises(TypeError): + PeriodDtype.construct_from_string('period[foo]') + with tm.assertRaises(TypeError): + PeriodDtype.construct_from_string('foo[D]') + + with tm.assertRaises(TypeError): + PeriodDtype.construct_from_string('datetime64[ns]') + with tm.assertRaises(TypeError): + PeriodDtype.construct_from_string('datetime64[ns, US/Eastern]') + + def test_is_dtype(self): + self.assertTrue(PeriodDtype.is_dtype(self.dtype)) + 
self.assertTrue(PeriodDtype.is_dtype('period[D]')) + self.assertTrue(PeriodDtype.is_dtype('period[3D]')) + self.assertTrue(PeriodDtype.is_dtype(PeriodDtype('3D'))) + self.assertTrue(PeriodDtype.is_dtype('period[U]')) + self.assertTrue(PeriodDtype.is_dtype('period[S]')) + self.assertTrue(PeriodDtype.is_dtype(PeriodDtype('U'))) + self.assertTrue(PeriodDtype.is_dtype(PeriodDtype('S'))) + + self.assertFalse(PeriodDtype.is_dtype('D')) + self.assertFalse(PeriodDtype.is_dtype('3D')) + self.assertFalse(PeriodDtype.is_dtype('U')) + self.assertFalse(PeriodDtype.is_dtype('S')) + self.assertFalse(PeriodDtype.is_dtype('foo')) + self.assertFalse(PeriodDtype.is_dtype(np.object_)) + self.assertFalse(PeriodDtype.is_dtype(np.int64)) + self.assertFalse(PeriodDtype.is_dtype(np.float64)) + + def test_equality(self): + self.assertTrue(is_dtype_equal(self.dtype, 'period[D]')) + self.assertTrue(is_dtype_equal(self.dtype, PeriodDtype('D'))) + self.assertTrue(is_dtype_equal(self.dtype, PeriodDtype('D'))) + self.assertTrue(is_dtype_equal(PeriodDtype('D'), PeriodDtype('D'))) + + self.assertFalse(is_dtype_equal(self.dtype, 'D')) + self.assertFalse(is_dtype_equal(PeriodDtype('D'), PeriodDtype('2D'))) + + def test_basic(self): + self.assertTrue(is_period_dtype(self.dtype)) + + pidx = pd.period_range('2013-01-01 09:00', periods=5, freq='H') + + self.assertTrue(is_period_dtype(pidx.dtype)) + self.assertTrue(is_period_dtype(pidx)) + self.assertTrue(is_period(pidx)) + + s = Series(pidx, name='A') + # dtypes + # series results in object dtype currently, + # is_period checks period_arraylike + self.assertFalse(is_period_dtype(s.dtype)) + self.assertFalse(is_period_dtype(s)) + self.assertTrue(is_period(s)) + + self.assertFalse(is_period_dtype(np.dtype('float64'))) + self.assertFalse(is_period_dtype(1.0)) + self.assertFalse(is_period(np.dtype('float64'))) + self.assertFalse(is_period(1.0)) + + def test_empty(self): + dt = PeriodDtype() + with tm.assertRaises(AttributeError): + str(dt) + + def test_not_string(self): + # though PeriodDtype has object kind, it cannot be string + self.assertFalse(is_string_dtype(PeriodDtype('D'))) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tests/types/test_generic.py b/pandas/tests/types/test_generic.py index 5549a3a376992..89913de6f6069 100644 --- a/pandas/tests/types/test_generic.py +++ b/pandas/tests/types/test_generic.py @@ -3,8 +3,8 @@ import nose import numpy as np import pandas as pd -import pandas.core.common as com import pandas.util.testing as tm +from pandas.types import generic as gt _multiprocess_can_split_ = True @@ -22,24 +22,24 @@ class TestABCClasses(tm.TestCase): sparse_array = pd.SparseArray(np.random.randn(10)) def test_abc_types(self): - self.assertIsInstance(pd.Index(['a', 'b', 'c']), com.ABCIndex) - self.assertIsInstance(pd.Int64Index([1, 2, 3]), com.ABCInt64Index) - self.assertIsInstance(pd.Float64Index([1, 2, 3]), com.ABCFloat64Index) - self.assertIsInstance(self.multi_index, com.ABCMultiIndex) - self.assertIsInstance(self.datetime_index, com.ABCDatetimeIndex) - self.assertIsInstance(self.timedelta_index, com.ABCTimedeltaIndex) - self.assertIsInstance(self.period_index, com.ABCPeriodIndex) + self.assertIsInstance(pd.Index(['a', 'b', 'c']), gt.ABCIndex) + self.assertIsInstance(pd.Int64Index([1, 2, 3]), gt.ABCInt64Index) + self.assertIsInstance(pd.Float64Index([1, 2, 3]), gt.ABCFloat64Index) + self.assertIsInstance(self.multi_index, gt.ABCMultiIndex) + self.assertIsInstance(self.datetime_index, 
gt.ABCDatetimeIndex) + self.assertIsInstance(self.timedelta_index, gt.ABCTimedeltaIndex) + self.assertIsInstance(self.period_index, gt.ABCPeriodIndex) self.assertIsInstance(self.categorical_df.index, - com.ABCCategoricalIndex) - self.assertIsInstance(pd.Index(['a', 'b', 'c']), com.ABCIndexClass) - self.assertIsInstance(pd.Int64Index([1, 2, 3]), com.ABCIndexClass) - self.assertIsInstance(pd.Series([1, 2, 3]), com.ABCSeries) - self.assertIsInstance(self.df, com.ABCDataFrame) - self.assertIsInstance(self.df.to_panel(), com.ABCPanel) - self.assertIsInstance(self.sparse_series, com.ABCSparseSeries) - self.assertIsInstance(self.sparse_array, com.ABCSparseArray) - self.assertIsInstance(self.categorical, com.ABCCategorical) - self.assertIsInstance(pd.Period('2012', freq='A-DEC'), com.ABCPeriod) + gt.ABCCategoricalIndex) + self.assertIsInstance(pd.Index(['a', 'b', 'c']), gt.ABCIndexClass) + self.assertIsInstance(pd.Int64Index([1, 2, 3]), gt.ABCIndexClass) + self.assertIsInstance(pd.Series([1, 2, 3]), gt.ABCSeries) + self.assertIsInstance(self.df, gt.ABCDataFrame) + self.assertIsInstance(self.df.to_panel(), gt.ABCPanel) + self.assertIsInstance(self.sparse_series, gt.ABCSparseSeries) + self.assertIsInstance(self.sparse_array, gt.ABCSparseArray) + self.assertIsInstance(self.categorical, gt.ABCCategorical) + self.assertIsInstance(pd.Period('2012', freq='A-DEC'), gt.ABCPeriod) if __name__ == '__main__': diff --git a/pandas/tests/types/test_inference.py b/pandas/tests/types/test_inference.py new file mode 100644 index 0000000000000..9a12220f5b41d --- /dev/null +++ b/pandas/tests/types/test_inference.py @@ -0,0 +1,847 @@ +# -*- coding: utf-8 -*- + +""" +These test the public routines exposed in types/common.py +related to inference and not otherwise tested in types/test_common.py + +""" + +import nose +import collections +import re +from datetime import datetime, date, timedelta, time +import numpy as np + +import pandas as pd +from pandas import lib, tslib +from pandas import (Series, Index, DataFrame, Timedelta, + DatetimeIndex, TimedeltaIndex, Timestamp, + Panel, Period) +from pandas.compat import u, PY2, lrange +from pandas.types import inference +from pandas.types.common import (is_timedelta64_dtype, + is_timedelta64_ns_dtype, + is_number, + is_integer, + is_float, + is_bool, + is_scalar, + _ensure_int32) +from pandas.types.missing import isnull +from pandas.util import testing as tm + +_multiprocess_can_split_ = True + + +def test_is_sequence(): + is_seq = inference.is_sequence + assert (is_seq((1, 2))) + assert (is_seq([1, 2])) + assert (not is_seq("abcd")) + assert (not is_seq(u("abcd"))) + assert (not is_seq(np.int64)) + + class A(object): + + def __getitem__(self): + return 1 + + assert (not is_seq(A())) + + +def test_is_list_like(): + passes = ([], [1], (1, ), (1, 2), {'a': 1}, set([1, 'a']), Series([1]), + Series([]), Series(['a']).str) + fails = (1, '2', object()) + + for p in passes: + assert inference.is_list_like(p) + + for f in fails: + assert not inference.is_list_like(f) + + +def test_is_dict_like(): + passes = [{}, {'A': 1}, Series([1])] + fails = ['1', 1, [1, 2], (1, 2), range(2), Index([1])] + + for p in passes: + assert inference.is_dict_like(p) + + for f in fails: + assert not inference.is_dict_like(f) + + +def test_is_named_tuple(): + passes = (collections.namedtuple('Test', list('abc'))(1, 2, 3), ) + fails = ((1, 2, 3), 'a', Series({'pi': 3.14})) + + for p in passes: + assert inference.is_named_tuple(p) + + for f in fails: + assert not inference.is_named_tuple(f) + + +def 
test_is_hashable(): + + # all new-style classes are hashable by default + class HashableClass(object): + pass + + class UnhashableClass1(object): + __hash__ = None + + class UnhashableClass2(object): + + def __hash__(self): + raise TypeError("Not hashable") + + hashable = (1, + 3.14, + np.float64(3.14), + 'a', + tuple(), + (1, ), + HashableClass(), ) + not_hashable = ([], UnhashableClass1(), ) + abc_hashable_not_really_hashable = (([], ), UnhashableClass2(), ) + + for i in hashable: + assert inference.is_hashable(i) + for i in not_hashable: + assert not inference.is_hashable(i) + for i in abc_hashable_not_really_hashable: + assert not inference.is_hashable(i) + + # numpy.array is no longer collections.Hashable as of + # https://github.com/numpy/numpy/pull/5326, just test + # is_hashable() + assert not inference.is_hashable(np.array([])) + + # old-style classes in Python 2 don't appear hashable to + # collections.Hashable but also seem to support hash() by default + if PY2: + + class OldStyleClass(): + pass + + c = OldStyleClass() + assert not isinstance(c, collections.Hashable) + assert inference.is_hashable(c) + hash(c) # this will not raise + + +def test_is_re(): + passes = re.compile('ad'), + fails = 'x', 2, 3, object() + + for p in passes: + assert inference.is_re(p) + + for f in fails: + assert not inference.is_re(f) + + +def test_is_recompilable(): + passes = (r'a', u('x'), r'asdf', re.compile('adsf'), u(r'\u2233\s*'), + re.compile(r'')) + fails = 1, [], object() + + for p in passes: + assert inference.is_re_compilable(p) + + for f in fails: + assert not inference.is_re_compilable(f) + + +class TestInference(tm.TestCase): + + def test_infer_dtype_bytes(self): + compare = 'string' if PY2 else 'bytes' + + # string array of bytes + arr = np.array(list('abc'), dtype='S1') + self.assertEqual(lib.infer_dtype(arr), compare) + + # object array of bytes + arr = arr.astype(object) + self.assertEqual(lib.infer_dtype(arr), compare) + + def test_isinf_scalar(self): + # GH 11352 + self.assertTrue(lib.isposinf_scalar(float('inf'))) + self.assertTrue(lib.isposinf_scalar(np.inf)) + self.assertFalse(lib.isposinf_scalar(-np.inf)) + self.assertFalse(lib.isposinf_scalar(1)) + self.assertFalse(lib.isposinf_scalar('a')) + + self.assertTrue(lib.isneginf_scalar(float('-inf'))) + self.assertTrue(lib.isneginf_scalar(-np.inf)) + self.assertFalse(lib.isneginf_scalar(np.inf)) + self.assertFalse(lib.isneginf_scalar(1)) + self.assertFalse(lib.isneginf_scalar('a')) + + def test_maybe_convert_numeric_infinities(self): + # see gh-13274 + infinities = ['inf', 'inF', 'iNf', 'Inf', + 'iNF', 'InF', 'INf', 'INF'] + na_values = set(['', 'NULL', 'nan']) + + pos = np.array(['inf'], dtype=np.float64) + neg = np.array(['-inf'], dtype=np.float64) + + msg = "Unable to parse string" + + for infinity in infinities: + for maybe_int in (True, False): + out = lib.maybe_convert_numeric( + np.array([infinity], dtype=object), + na_values, maybe_int) + tm.assert_numpy_array_equal(out, pos) + + out = lib.maybe_convert_numeric( + np.array(['-' + infinity], dtype=object), + na_values, maybe_int) + tm.assert_numpy_array_equal(out, neg) + + out = lib.maybe_convert_numeric( + np.array([u(infinity)], dtype=object), + na_values, maybe_int) + tm.assert_numpy_array_equal(out, pos) + + out = lib.maybe_convert_numeric( + np.array(['+' + infinity], dtype=object), + na_values, maybe_int) + tm.assert_numpy_array_equal(out, pos) + + # too many characters + with tm.assertRaisesRegexp(ValueError, msg): + lib.maybe_convert_numeric( + 
np.array(['foo_' + infinity], dtype=object), + na_values, maybe_int) + + def test_maybe_convert_numeric_post_floatify_nan(self): + # see gh-13314 + data = np.array(['1.200', '-999.000', '4.500'], dtype=object) + expected = np.array([1.2, np.nan, 4.5], dtype=np.float64) + nan_values = set([-999, -999.0]) + + for coerce_type in (True, False): + out = lib.maybe_convert_numeric(data, nan_values, coerce_type) + tm.assert_numpy_array_equal(out, expected) + + def test_convert_infs(self): + arr = np.array(['inf', 'inf', 'inf'], dtype='O') + result = lib.maybe_convert_numeric(arr, set(), False) + self.assertTrue(result.dtype == np.float64) + + arr = np.array(['-inf', '-inf', '-inf'], dtype='O') + result = lib.maybe_convert_numeric(arr, set(), False) + self.assertTrue(result.dtype == np.float64) + + def test_scientific_no_exponent(self): + # See PR 12215 + arr = np.array(['42E', '2E', '99e', '6e'], dtype='O') + result = lib.maybe_convert_numeric(arr, set(), False, True) + self.assertTrue(np.all(np.isnan(result))) + + def test_convert_non_hashable(self): + # GH13324 + # make sure that we are handing non-hashables + arr = np.array([[10.0, 2], 1.0, 'apple']) + result = lib.maybe_convert_numeric(arr, set(), False, True) + tm.assert_numpy_array_equal(result, np.array([np.nan, 1.0, np.nan])) + + +class TestTypeInference(tm.TestCase): + _multiprocess_can_split_ = True + + def test_length_zero(self): + result = lib.infer_dtype(np.array([], dtype='i4')) + self.assertEqual(result, 'integer') + + result = lib.infer_dtype([]) + self.assertEqual(result, 'empty') + + def test_integers(self): + arr = np.array([1, 2, 3, np.int64(4), np.int32(5)], dtype='O') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'integer') + + arr = np.array([1, 2, 3, np.int64(4), np.int32(5), 'foo'], dtype='O') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'mixed-integer') + + arr = np.array([1, 2, 3, 4, 5], dtype='i4') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'integer') + + def test_bools(self): + arr = np.array([True, False, True, True, True], dtype='O') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'boolean') + + arr = np.array([np.bool_(True), np.bool_(False)], dtype='O') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'boolean') + + arr = np.array([True, False, True, 'foo'], dtype='O') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'mixed') + + arr = np.array([True, False, True], dtype=bool) + result = lib.infer_dtype(arr) + self.assertEqual(result, 'boolean') + + def test_floats(self): + arr = np.array([1., 2., 3., np.float64(4), np.float32(5)], dtype='O') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'floating') + + arr = np.array([1, 2, 3, np.float64(4), np.float32(5), 'foo'], + dtype='O') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'mixed-integer') + + arr = np.array([1, 2, 3, 4, 5], dtype='f4') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'floating') + + arr = np.array([1, 2, 3, 4, 5], dtype='f8') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'floating') + + def test_string(self): + pass + + def test_unicode(self): + pass + + def test_datetime(self): + + dates = [datetime(2012, 1, x) for x in range(1, 20)] + index = Index(dates) + self.assertEqual(index.inferred_type, 'datetime64') + + def test_infer_dtype_datetime(self): + + arr = np.array([Timestamp('2011-01-01'), + Timestamp('2011-01-02')]) + self.assertEqual(lib.infer_dtype(arr), 'datetime') + + arr = np.array([np.datetime64('2011-01-01'), + 
np.datetime64('2011-01-01')], dtype=object) + self.assertEqual(lib.infer_dtype(arr), 'datetime64') + + arr = np.array([datetime(2011, 1, 1), datetime(2012, 2, 1)]) + self.assertEqual(lib.infer_dtype(arr), 'datetime') + + # starts with nan + for n in [pd.NaT, np.nan]: + arr = np.array([n, pd.Timestamp('2011-01-02')]) + self.assertEqual(lib.infer_dtype(arr), 'datetime') + + arr = np.array([n, np.datetime64('2011-01-02')]) + self.assertEqual(lib.infer_dtype(arr), 'datetime64') + + arr = np.array([n, datetime(2011, 1, 1)]) + self.assertEqual(lib.infer_dtype(arr), 'datetime') + + arr = np.array([n, pd.Timestamp('2011-01-02'), n]) + self.assertEqual(lib.infer_dtype(arr), 'datetime') + + arr = np.array([n, np.datetime64('2011-01-02'), n]) + self.assertEqual(lib.infer_dtype(arr), 'datetime64') + + arr = np.array([n, datetime(2011, 1, 1), n]) + self.assertEqual(lib.infer_dtype(arr), 'datetime') + + # different type of nat + arr = np.array([np.timedelta64('nat'), + np.datetime64('2011-01-02')], dtype=object) + self.assertEqual(lib.infer_dtype(arr), 'mixed') + + arr = np.array([np.datetime64('2011-01-02'), + np.timedelta64('nat')], dtype=object) + self.assertEqual(lib.infer_dtype(arr), 'mixed') + + # mixed datetime + arr = np.array([datetime(2011, 1, 1), + pd.Timestamp('2011-01-02')]) + self.assertEqual(lib.infer_dtype(arr), 'datetime') + + # should be datetime? + arr = np.array([np.datetime64('2011-01-01'), + pd.Timestamp('2011-01-02')]) + self.assertEqual(lib.infer_dtype(arr), 'mixed') + + arr = np.array([pd.Timestamp('2011-01-02'), + np.datetime64('2011-01-01')]) + self.assertEqual(lib.infer_dtype(arr), 'mixed') + + arr = np.array([np.nan, pd.Timestamp('2011-01-02'), 1]) + self.assertEqual(lib.infer_dtype(arr), 'mixed-integer') + + arr = np.array([np.nan, pd.Timestamp('2011-01-02'), 1.1]) + self.assertEqual(lib.infer_dtype(arr), 'mixed') + + arr = np.array([np.nan, '2011-01-01', pd.Timestamp('2011-01-02')]) + self.assertEqual(lib.infer_dtype(arr), 'mixed') + + def test_infer_dtype_timedelta(self): + + arr = np.array([pd.Timedelta('1 days'), + pd.Timedelta('2 days')]) + self.assertEqual(lib.infer_dtype(arr), 'timedelta') + + arr = np.array([np.timedelta64(1, 'D'), + np.timedelta64(2, 'D')], dtype=object) + self.assertEqual(lib.infer_dtype(arr), 'timedelta') + + arr = np.array([timedelta(1), timedelta(2)]) + self.assertEqual(lib.infer_dtype(arr), 'timedelta') + + # starts with nan + for n in [pd.NaT, np.nan]: + arr = np.array([n, Timedelta('1 days')]) + self.assertEqual(lib.infer_dtype(arr), 'timedelta') + + arr = np.array([n, np.timedelta64(1, 'D')]) + self.assertEqual(lib.infer_dtype(arr), 'timedelta') + + arr = np.array([n, timedelta(1)]) + self.assertEqual(lib.infer_dtype(arr), 'timedelta') + + arr = np.array([n, pd.Timedelta('1 days'), n]) + self.assertEqual(lib.infer_dtype(arr), 'timedelta') + + arr = np.array([n, np.timedelta64(1, 'D'), n]) + self.assertEqual(lib.infer_dtype(arr), 'timedelta') + + arr = np.array([n, timedelta(1), n]) + self.assertEqual(lib.infer_dtype(arr), 'timedelta') + + # different type of nat + arr = np.array([np.datetime64('nat'), np.timedelta64(1, 'D')], + dtype=object) + self.assertEqual(lib.infer_dtype(arr), 'mixed') + + arr = np.array([np.timedelta64(1, 'D'), np.datetime64('nat')], + dtype=object) + self.assertEqual(lib.infer_dtype(arr), 'mixed') + + def test_infer_dtype_period(self): + # GH 13664 + arr = np.array([pd.Period('2011-01', freq='D'), + pd.Period('2011-02', freq='D')]) + self.assertEqual(pd.lib.infer_dtype(arr), 'period') + + arr = 
np.array([pd.Period('2011-01', freq='D'), + pd.Period('2011-02', freq='M')]) + self.assertEqual(pd.lib.infer_dtype(arr), 'period') + + # starts with nan + for n in [pd.NaT, np.nan]: + arr = np.array([n, pd.Period('2011-01', freq='D')]) + self.assertEqual(pd.lib.infer_dtype(arr), 'period') + + arr = np.array([n, pd.Period('2011-01', freq='D'), n]) + self.assertEqual(pd.lib.infer_dtype(arr), 'period') + + # different type of nat + arr = np.array([np.datetime64('nat'), pd.Period('2011-01', freq='M')], + dtype=object) + self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') + + arr = np.array([pd.Period('2011-01', freq='M'), np.datetime64('nat')], + dtype=object) + self.assertEqual(pd.lib.infer_dtype(arr), 'mixed') + + def test_infer_dtype_all_nan_nat_like(self): + arr = np.array([np.nan, np.nan]) + self.assertEqual(lib.infer_dtype(arr), 'floating') + + # nan and None mix are result in mixed + arr = np.array([np.nan, np.nan, None]) + self.assertEqual(lib.infer_dtype(arr), 'mixed') + + arr = np.array([None, np.nan, np.nan]) + self.assertEqual(lib.infer_dtype(arr), 'mixed') + + # pd.NaT + arr = np.array([pd.NaT]) + self.assertEqual(lib.infer_dtype(arr), 'datetime') + + arr = np.array([pd.NaT, np.nan]) + self.assertEqual(lib.infer_dtype(arr), 'datetime') + + arr = np.array([np.nan, pd.NaT]) + self.assertEqual(lib.infer_dtype(arr), 'datetime') + + arr = np.array([np.nan, pd.NaT, np.nan]) + self.assertEqual(lib.infer_dtype(arr), 'datetime') + + arr = np.array([None, pd.NaT, None]) + self.assertEqual(lib.infer_dtype(arr), 'datetime') + + # np.datetime64(nat) + arr = np.array([np.datetime64('nat')]) + self.assertEqual(lib.infer_dtype(arr), 'datetime64') + + for n in [np.nan, pd.NaT, None]: + arr = np.array([n, np.datetime64('nat'), n]) + self.assertEqual(lib.infer_dtype(arr), 'datetime64') + + arr = np.array([pd.NaT, n, np.datetime64('nat'), n]) + self.assertEqual(lib.infer_dtype(arr), 'datetime64') + + arr = np.array([np.timedelta64('nat')], dtype=object) + self.assertEqual(lib.infer_dtype(arr), 'timedelta') + + for n in [np.nan, pd.NaT, None]: + arr = np.array([n, np.timedelta64('nat'), n]) + self.assertEqual(lib.infer_dtype(arr), 'timedelta') + + arr = np.array([pd.NaT, n, np.timedelta64('nat'), n]) + self.assertEqual(lib.infer_dtype(arr), 'timedelta') + + # datetime / timedelta mixed + arr = np.array([pd.NaT, np.datetime64('nat'), + np.timedelta64('nat'), np.nan]) + self.assertEqual(lib.infer_dtype(arr), 'mixed') + + arr = np.array([np.timedelta64('nat'), np.datetime64('nat')], + dtype=object) + self.assertEqual(lib.infer_dtype(arr), 'mixed') + + def test_is_datetimelike_array_all_nan_nat_like(self): + arr = np.array([np.nan, pd.NaT, np.datetime64('nat')]) + self.assertTrue(lib.is_datetime_array(arr)) + self.assertTrue(lib.is_datetime64_array(arr)) + self.assertFalse(lib.is_timedelta_array(arr)) + self.assertFalse(lib.is_timedelta64_array(arr)) + self.assertFalse(lib.is_timedelta_or_timedelta64_array(arr)) + + arr = np.array([np.nan, pd.NaT, np.timedelta64('nat')]) + self.assertFalse(lib.is_datetime_array(arr)) + self.assertFalse(lib.is_datetime64_array(arr)) + self.assertTrue(lib.is_timedelta_array(arr)) + self.assertTrue(lib.is_timedelta64_array(arr)) + self.assertTrue(lib.is_timedelta_or_timedelta64_array(arr)) + + arr = np.array([np.nan, pd.NaT, np.datetime64('nat'), + np.timedelta64('nat')]) + self.assertFalse(lib.is_datetime_array(arr)) + self.assertFalse(lib.is_datetime64_array(arr)) + self.assertFalse(lib.is_timedelta_array(arr)) + self.assertFalse(lib.is_timedelta64_array(arr)) + 
self.assertFalse(lib.is_timedelta_or_timedelta64_array(arr)) + + arr = np.array([np.nan, pd.NaT]) + self.assertTrue(lib.is_datetime_array(arr)) + self.assertTrue(lib.is_datetime64_array(arr)) + self.assertTrue(lib.is_timedelta_array(arr)) + self.assertTrue(lib.is_timedelta64_array(arr)) + self.assertTrue(lib.is_timedelta_or_timedelta64_array(arr)) + + arr = np.array([np.nan, np.nan], dtype=object) + self.assertFalse(lib.is_datetime_array(arr)) + self.assertFalse(lib.is_datetime64_array(arr)) + self.assertFalse(lib.is_timedelta_array(arr)) + self.assertFalse(lib.is_timedelta64_array(arr)) + self.assertFalse(lib.is_timedelta_or_timedelta64_array(arr)) + + def test_date(self): + + dates = [date(2012, 1, x) for x in range(1, 20)] + index = Index(dates) + self.assertEqual(index.inferred_type, 'date') + + def test_to_object_array_tuples(self): + r = (5, 6) + values = [r] + result = lib.to_object_array_tuples(values) + + try: + # make sure record array works + from collections import namedtuple + record = namedtuple('record', 'x y') + r = record(5, 6) + values = [r] + result = lib.to_object_array_tuples(values) # noqa + except ImportError: + pass + + def test_object(self): + + # GH 7431 + # cannot infer more than this as only a single element + arr = np.array([None], dtype='O') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'mixed') + + def test_to_object_array_width(self): + # see gh-13320 + rows = [[1, 2, 3], [4, 5, 6]] + + expected = np.array(rows, dtype=object) + out = lib.to_object_array(rows) + tm.assert_numpy_array_equal(out, expected) + + expected = np.array(rows, dtype=object) + out = lib.to_object_array(rows, min_width=1) + tm.assert_numpy_array_equal(out, expected) + + expected = np.array([[1, 2, 3, None, None], + [4, 5, 6, None, None]], dtype=object) + out = lib.to_object_array(rows, min_width=5) + tm.assert_numpy_array_equal(out, expected) + + def test_is_period(self): + self.assertTrue(lib.is_period(pd.Period('2011-01', freq='M'))) + self.assertFalse(lib.is_period(pd.PeriodIndex(['2011-01'], freq='M'))) + self.assertFalse(lib.is_period(pd.Timestamp('2011-01'))) + self.assertFalse(lib.is_period(1)) + self.assertFalse(lib.is_period(np.nan)) + + def test_categorical(self): + + # GH 8974 + from pandas import Categorical, Series + arr = Categorical(list('abc')) + result = lib.infer_dtype(arr) + self.assertEqual(result, 'categorical') + + result = lib.infer_dtype(Series(arr)) + self.assertEqual(result, 'categorical') + + arr = Categorical(list('abc'), categories=['cegfab'], ordered=True) + result = lib.infer_dtype(arr) + self.assertEqual(result, 'categorical') + + result = lib.infer_dtype(Series(arr)) + self.assertEqual(result, 'categorical') + + +class TestNumberScalar(tm.TestCase): + + def test_is_number(self): + + self.assertTrue(is_number(True)) + self.assertTrue(is_number(1)) + self.assertTrue(is_number(1.1)) + self.assertTrue(is_number(1 + 3j)) + self.assertTrue(is_number(np.bool(False))) + self.assertTrue(is_number(np.int64(1))) + self.assertTrue(is_number(np.float64(1.1))) + self.assertTrue(is_number(np.complex128(1 + 3j))) + self.assertTrue(is_number(np.nan)) + + self.assertFalse(is_number(None)) + self.assertFalse(is_number('x')) + self.assertFalse(is_number(datetime(2011, 1, 1))) + self.assertFalse(is_number(np.datetime64('2011-01-01'))) + self.assertFalse(is_number(Timestamp('2011-01-01'))) + self.assertFalse(is_number(Timestamp('2011-01-01', + tz='US/Eastern'))) + self.assertFalse(is_number(timedelta(1000))) + self.assertFalse(is_number(Timedelta('1 days'))) + + # 
questionable + self.assertFalse(is_number(np.bool_(False))) + self.assertTrue(is_number(np.timedelta64(1, 'D'))) + + def test_is_bool(self): + self.assertTrue(is_bool(True)) + self.assertTrue(is_bool(np.bool(False))) + self.assertTrue(is_bool(np.bool_(False))) + + self.assertFalse(is_bool(1)) + self.assertFalse(is_bool(1.1)) + self.assertFalse(is_bool(1 + 3j)) + self.assertFalse(is_bool(np.int64(1))) + self.assertFalse(is_bool(np.float64(1.1))) + self.assertFalse(is_bool(np.complex128(1 + 3j))) + self.assertFalse(is_bool(np.nan)) + self.assertFalse(is_bool(None)) + self.assertFalse(is_bool('x')) + self.assertFalse(is_bool(datetime(2011, 1, 1))) + self.assertFalse(is_bool(np.datetime64('2011-01-01'))) + self.assertFalse(is_bool(Timestamp('2011-01-01'))) + self.assertFalse(is_bool(Timestamp('2011-01-01', + tz='US/Eastern'))) + self.assertFalse(is_bool(timedelta(1000))) + self.assertFalse(is_bool(np.timedelta64(1, 'D'))) + self.assertFalse(is_bool(Timedelta('1 days'))) + + def test_is_integer(self): + self.assertTrue(is_integer(1)) + self.assertTrue(is_integer(np.int64(1))) + + self.assertFalse(is_integer(True)) + self.assertFalse(is_integer(1.1)) + self.assertFalse(is_integer(1 + 3j)) + self.assertFalse(is_integer(np.bool(False))) + self.assertFalse(is_integer(np.bool_(False))) + self.assertFalse(is_integer(np.float64(1.1))) + self.assertFalse(is_integer(np.complex128(1 + 3j))) + self.assertFalse(is_integer(np.nan)) + self.assertFalse(is_integer(None)) + self.assertFalse(is_integer('x')) + self.assertFalse(is_integer(datetime(2011, 1, 1))) + self.assertFalse(is_integer(np.datetime64('2011-01-01'))) + self.assertFalse(is_integer(Timestamp('2011-01-01'))) + self.assertFalse(is_integer(Timestamp('2011-01-01', + tz='US/Eastern'))) + self.assertFalse(is_integer(timedelta(1000))) + self.assertFalse(is_integer(Timedelta('1 days'))) + + # questionable + self.assertTrue(is_integer(np.timedelta64(1, 'D'))) + + def test_is_float(self): + self.assertTrue(is_float(1.1)) + self.assertTrue(is_float(np.float64(1.1))) + self.assertTrue(is_float(np.nan)) + + self.assertFalse(is_float(True)) + self.assertFalse(is_float(1)) + self.assertFalse(is_float(1 + 3j)) + self.assertFalse(is_float(np.bool(False))) + self.assertFalse(is_float(np.bool_(False))) + self.assertFalse(is_float(np.int64(1))) + self.assertFalse(is_float(np.complex128(1 + 3j))) + self.assertFalse(is_float(None)) + self.assertFalse(is_float('x')) + self.assertFalse(is_float(datetime(2011, 1, 1))) + self.assertFalse(is_float(np.datetime64('2011-01-01'))) + self.assertFalse(is_float(Timestamp('2011-01-01'))) + self.assertFalse(is_float(Timestamp('2011-01-01', + tz='US/Eastern'))) + self.assertFalse(is_float(timedelta(1000))) + self.assertFalse(is_float(np.timedelta64(1, 'D'))) + self.assertFalse(is_float(Timedelta('1 days'))) + + def test_is_timedelta(self): + self.assertTrue(is_timedelta64_dtype('timedelta64')) + self.assertTrue(is_timedelta64_dtype('timedelta64[ns]')) + self.assertFalse(is_timedelta64_ns_dtype('timedelta64')) + self.assertTrue(is_timedelta64_ns_dtype('timedelta64[ns]')) + + tdi = TimedeltaIndex([1e14, 2e14], dtype='timedelta64') + self.assertTrue(is_timedelta64_dtype(tdi)) + self.assertTrue(is_timedelta64_ns_dtype(tdi)) + self.assertTrue(is_timedelta64_ns_dtype(tdi.astype('timedelta64[ns]'))) + + # Conversion to Int64Index: + self.assertFalse(is_timedelta64_ns_dtype(tdi.astype('timedelta64'))) + self.assertFalse(is_timedelta64_ns_dtype(tdi.astype('timedelta64[h]'))) + + +class Testisscalar(tm.TestCase): + + def 
test_isscalar_builtin_scalars(self): + self.assertTrue(is_scalar(None)) + self.assertTrue(is_scalar(True)) + self.assertTrue(is_scalar(False)) + self.assertTrue(is_scalar(0.)) + self.assertTrue(is_scalar(np.nan)) + self.assertTrue(is_scalar('foobar')) + self.assertTrue(is_scalar(b'foobar')) + self.assertTrue(is_scalar(u('efoobar'))) + self.assertTrue(is_scalar(datetime(2014, 1, 1))) + self.assertTrue(is_scalar(date(2014, 1, 1))) + self.assertTrue(is_scalar(time(12, 0))) + self.assertTrue(is_scalar(timedelta(hours=1))) + self.assertTrue(is_scalar(pd.NaT)) + + def test_isscalar_builtin_nonscalars(self): + self.assertFalse(is_scalar({})) + self.assertFalse(is_scalar([])) + self.assertFalse(is_scalar([1])) + self.assertFalse(is_scalar(())) + self.assertFalse(is_scalar((1, ))) + self.assertFalse(is_scalar(slice(None))) + self.assertFalse(is_scalar(Ellipsis)) + + def test_isscalar_numpy_array_scalars(self): + self.assertTrue(is_scalar(np.int64(1))) + self.assertTrue(is_scalar(np.float64(1.))) + self.assertTrue(is_scalar(np.int32(1))) + self.assertTrue(is_scalar(np.object_('foobar'))) + self.assertTrue(is_scalar(np.str_('foobar'))) + self.assertTrue(is_scalar(np.unicode_(u('foobar')))) + self.assertTrue(is_scalar(np.bytes_(b'foobar'))) + self.assertTrue(is_scalar(np.datetime64('2014-01-01'))) + self.assertTrue(is_scalar(np.timedelta64(1, 'h'))) + + def test_isscalar_numpy_zerodim_arrays(self): + for zerodim in [np.array(1), np.array('foobar'), + np.array(np.datetime64('2014-01-01')), + np.array(np.timedelta64(1, 'h')), + np.array(np.datetime64('NaT'))]: + self.assertFalse(is_scalar(zerodim)) + self.assertTrue(is_scalar(lib.item_from_zerodim(zerodim))) + + def test_isscalar_numpy_arrays(self): + self.assertFalse(is_scalar(np.array([]))) + self.assertFalse(is_scalar(np.array([[]]))) + self.assertFalse(is_scalar(np.matrix('1; 2'))) + + def test_isscalar_pandas_scalars(self): + self.assertTrue(is_scalar(Timestamp('2014-01-01'))) + self.assertTrue(is_scalar(Timedelta(hours=1))) + self.assertTrue(is_scalar(Period('2014-01-01'))) + + def test_lisscalar_pandas_containers(self): + self.assertFalse(is_scalar(Series())) + self.assertFalse(is_scalar(Series([1]))) + self.assertFalse(is_scalar(DataFrame())) + self.assertFalse(is_scalar(DataFrame([[1]]))) + self.assertFalse(is_scalar(Panel())) + self.assertFalse(is_scalar(Panel([[[1]]]))) + self.assertFalse(is_scalar(Index([]))) + self.assertFalse(is_scalar(Index([1]))) + + +def test_datetimeindex_from_empty_datetime64_array(): + for unit in ['ms', 'us', 'ns']: + idx = DatetimeIndex(np.array([], dtype='datetime64[%s]' % unit)) + assert (len(idx) == 0) + + +def test_nan_to_nat_conversions(): + + df = DataFrame(dict({ + 'A': np.asarray( + lrange(10), dtype='float64'), + 'B': Timestamp('20010101') + })) + df.iloc[3:6, :] = np.nan + result = df.loc[4, 'B'].value + assert (result == tslib.iNaT) + + s = df['B'].copy() + s._data = s._data.setitem(indexer=tuple([slice(8, 9)]), value=np.nan) + assert (isnull(s[8])) + + # numpy < 1.7.0 is wrong + from distutils.version import LooseVersion + if LooseVersion(np.__version__) >= '1.7.0': + assert (s[8].value == np.datetime64('NaT').astype(np.int64)) + + +def test_ensure_int32(): + values = np.arange(10, dtype=np.int32) + result = _ensure_int32(values) + assert (result.dtype == np.int32) + + values = np.arange(10, dtype=np.int64) + result = _ensure_int32(values) + assert (result.dtype == np.int32) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git 
a/pandas/tests/types/test_io.py b/pandas/tests/types/test_io.py new file mode 100644 index 0000000000000..545edf8f1386c --- /dev/null +++ b/pandas/tests/types/test_io.py @@ -0,0 +1,116 @@ +# -*- coding: utf-8 -*- + +import numpy as np +import pandas.lib as lib +import pandas.util.testing as tm + +from pandas.compat import long, u + + +class TestParseSQL(tm.TestCase): + + def test_convert_sql_column_floats(self): + arr = np.array([1.5, None, 3, 4.2], dtype=object) + result = lib.convert_sql_column(arr) + expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8') + self.assert_numpy_array_equal(result, expected) + + def test_convert_sql_column_strings(self): + arr = np.array(['1.5', None, '3', '4.2'], dtype=object) + result = lib.convert_sql_column(arr) + expected = np.array(['1.5', np.nan, '3', '4.2'], dtype=object) + self.assert_numpy_array_equal(result, expected) + + def test_convert_sql_column_unicode(self): + arr = np.array([u('1.5'), None, u('3'), u('4.2')], + dtype=object) + result = lib.convert_sql_column(arr) + expected = np.array([u('1.5'), np.nan, u('3'), u('4.2')], + dtype=object) + self.assert_numpy_array_equal(result, expected) + + def test_convert_sql_column_ints(self): + arr = np.array([1, 2, 3, 4], dtype='O') + arr2 = np.array([1, 2, 3, 4], dtype='i4').astype('O') + result = lib.convert_sql_column(arr) + result2 = lib.convert_sql_column(arr2) + expected = np.array([1, 2, 3, 4], dtype='i8') + self.assert_numpy_array_equal(result, expected) + self.assert_numpy_array_equal(result2, expected) + + arr = np.array([1, 2, 3, None, 4], dtype='O') + result = lib.convert_sql_column(arr) + expected = np.array([1, 2, 3, np.nan, 4], dtype='f8') + self.assert_numpy_array_equal(result, expected) + + def test_convert_sql_column_longs(self): + arr = np.array([long(1), long(2), long(3), long(4)], dtype='O') + result = lib.convert_sql_column(arr) + expected = np.array([1, 2, 3, 4], dtype='i8') + self.assert_numpy_array_equal(result, expected) + + arr = np.array([long(1), long(2), long(3), None, long(4)], dtype='O') + result = lib.convert_sql_column(arr) + expected = np.array([1, 2, 3, np.nan, 4], dtype='f8') + self.assert_numpy_array_equal(result, expected) + + def test_convert_sql_column_bools(self): + arr = np.array([True, False, True, False], dtype='O') + result = lib.convert_sql_column(arr) + expected = np.array([True, False, True, False], dtype=bool) + self.assert_numpy_array_equal(result, expected) + + arr = np.array([True, False, None, False], dtype='O') + result = lib.convert_sql_column(arr) + expected = np.array([True, False, np.nan, False], dtype=object) + self.assert_numpy_array_equal(result, expected) + + def test_convert_sql_column_decimals(self): + from decimal import Decimal + arr = np.array([Decimal('1.5'), None, Decimal('3'), Decimal('4.2')]) + result = lib.convert_sql_column(arr) + expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8') + self.assert_numpy_array_equal(result, expected) + + def test_convert_downcast_int64(self): + from pandas.parser import na_values + + arr = np.array([1, 2, 7, 8, 10], dtype=np.int64) + expected = np.array([1, 2, 7, 8, 10], dtype=np.int8) + + # default argument + result = lib.downcast_int64(arr, na_values) + self.assert_numpy_array_equal(result, expected) + + result = lib.downcast_int64(arr, na_values, use_unsigned=False) + self.assert_numpy_array_equal(result, expected) + + expected = np.array([1, 2, 7, 8, 10], dtype=np.uint8) + result = lib.downcast_int64(arr, na_values, use_unsigned=True) + self.assert_numpy_array_equal(result, expected) + + # 
still cast to int8 despite use_unsigned=True + # because of the negative number as an element + arr = np.array([1, 2, -7, 8, 10], dtype=np.int64) + expected = np.array([1, 2, -7, 8, 10], dtype=np.int8) + result = lib.downcast_int64(arr, na_values, use_unsigned=True) + self.assert_numpy_array_equal(result, expected) + + arr = np.array([1, 2, 7, 8, 300], dtype=np.int64) + expected = np.array([1, 2, 7, 8, 300], dtype=np.int16) + result = lib.downcast_int64(arr, na_values) + self.assert_numpy_array_equal(result, expected) + + int8_na = na_values[np.int8] + int64_na = na_values[np.int64] + arr = np.array([int64_na, 2, 3, 10, 15], dtype=np.int64) + expected = np.array([int8_na, 2, 3, 10, 15], dtype=np.int8) + result = lib.downcast_int64(arr, na_values) + self.assert_numpy_array_equal(result, expected) + + +if __name__ == '__main__': + import nose + + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/types/test_missing.py b/pandas/tests/types/test_missing.py new file mode 100644 index 0000000000000..fa2bd535bb8d5 --- /dev/null +++ b/pandas/tests/types/test_missing.py @@ -0,0 +1,311 @@ +# -*- coding: utf-8 -*- + +import nose +import numpy as np +from datetime import datetime +from pandas.util import testing as tm + +import pandas as pd +from pandas.core import config as cf +from pandas.compat import u +from pandas.tslib import iNaT +from pandas import (NaT, Float64Index, Series, + DatetimeIndex, TimedeltaIndex, date_range) +from pandas.types.dtypes import DatetimeTZDtype +from pandas.types.missing import (array_equivalent, isnull, notnull, + na_value_for_dtype) + +_multiprocess_can_split_ = True + + +def test_notnull(): + assert notnull(1.) + assert not notnull(None) + assert not notnull(np.NaN) + + with cf.option_context("mode.use_inf_as_null", False): + assert notnull(np.inf) + assert notnull(-np.inf) + + arr = np.array([1.5, np.inf, 3.5, -np.inf]) + result = notnull(arr) + assert result.all() + + with cf.option_context("mode.use_inf_as_null", True): + assert not notnull(np.inf) + assert not notnull(-np.inf) + + arr = np.array([1.5, np.inf, 3.5, -np.inf]) + result = notnull(arr) + assert result.sum() == 2 + + with cf.option_context("mode.use_inf_as_null", False): + for s in [tm.makeFloatSeries(), tm.makeStringSeries(), + tm.makeObjectSeries(), tm.makeTimeSeries(), + tm.makePeriodSeries()]: + assert (isinstance(isnull(s), Series)) + + +class TestIsNull(tm.TestCase): + + def test_0d_array(self): + self.assertTrue(isnull(np.array(np.nan))) + self.assertFalse(isnull(np.array(0.0))) + self.assertFalse(isnull(np.array(0))) + # test object dtype + self.assertTrue(isnull(np.array(np.nan, dtype=object))) + self.assertFalse(isnull(np.array(0.0, dtype=object))) + self.assertFalse(isnull(np.array(0, dtype=object))) + + def test_isnull(self): + self.assertFalse(isnull(1.)) + self.assertTrue(isnull(None)) + self.assertTrue(isnull(np.NaN)) + self.assertTrue(float('nan')) + self.assertFalse(isnull(np.inf)) + self.assertFalse(isnull(-np.inf)) + + # series + for s in [tm.makeFloatSeries(), tm.makeStringSeries(), + tm.makeObjectSeries(), tm.makeTimeSeries(), + tm.makePeriodSeries()]: + self.assertIsInstance(isnull(s), Series) + + # frame + for df in [tm.makeTimeDataFrame(), tm.makePeriodFrame(), + tm.makeMixedDataFrame()]: + result = isnull(df) + expected = df.apply(isnull) + tm.assert_frame_equal(result, expected) + + # panel + for p in [tm.makePanel(), tm.makePeriodPanel(), + tm.add_nans(tm.makePanel())]: + result = isnull(p) + expected = 
p.apply(isnull) + tm.assert_panel_equal(result, expected) + + # panel 4d + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + for p in [tm.makePanel4D(), tm.add_nans_panel4d(tm.makePanel4D())]: + result = isnull(p) + expected = p.apply(isnull) + tm.assert_panel4d_equal(result, expected) + + def test_isnull_lists(self): + result = isnull([[False]]) + exp = np.array([[False]]) + tm.assert_numpy_array_equal(result, exp) + + result = isnull([[1], [2]]) + exp = np.array([[False], [False]]) + tm.assert_numpy_array_equal(result, exp) + + # list of strings / unicode + result = isnull(['foo', 'bar']) + exp = np.array([False, False]) + tm.assert_numpy_array_equal(result, exp) + + result = isnull([u('foo'), u('bar')]) + exp = np.array([False, False]) + tm.assert_numpy_array_equal(result, exp) + + def test_isnull_nat(self): + result = isnull([NaT]) + exp = np.array([True]) + tm.assert_numpy_array_equal(result, exp) + + result = isnull(np.array([NaT], dtype=object)) + exp = np.array([True]) + tm.assert_numpy_array_equal(result, exp) + + def test_isnull_numpy_nat(self): + arr = np.array([NaT, np.datetime64('NaT'), np.timedelta64('NaT'), + np.datetime64('NaT', 's')]) + result = isnull(arr) + expected = np.array([True] * 4) + tm.assert_numpy_array_equal(result, expected) + + def test_isnull_datetime(self): + self.assertFalse(isnull(datetime.now())) + self.assertTrue(notnull(datetime.now())) + + idx = date_range('1/1/1990', periods=20) + exp = np.ones(len(idx), dtype=bool) + tm.assert_numpy_array_equal(notnull(idx), exp) + + idx = np.asarray(idx) + idx[0] = iNaT + idx = DatetimeIndex(idx) + mask = isnull(idx) + self.assertTrue(mask[0]) + exp = np.array([True] + [False] * (len(idx) - 1), dtype=bool) + self.assert_numpy_array_equal(mask, exp) + + # GH 9129 + pidx = idx.to_period(freq='M') + mask = isnull(pidx) + self.assertTrue(mask[0]) + exp = np.array([True] + [False] * (len(idx) - 1), dtype=bool) + self.assert_numpy_array_equal(mask, exp) + + mask = isnull(pidx[1:]) + exp = np.zeros(len(mask), dtype=bool) + self.assert_numpy_array_equal(mask, exp) + + def test_datetime_other_units(self): + idx = pd.DatetimeIndex(['2011-01-01', 'NaT', '2011-01-02']) + exp = np.array([False, True, False]) + tm.assert_numpy_array_equal(isnull(idx), exp) + tm.assert_numpy_array_equal(notnull(idx), ~exp) + tm.assert_numpy_array_equal(isnull(idx.values), exp) + tm.assert_numpy_array_equal(notnull(idx.values), ~exp) + + for dtype in ['datetime64[D]', 'datetime64[h]', 'datetime64[m]', + 'datetime64[s]', 'datetime64[ms]', 'datetime64[us]', + 'datetime64[ns]']: + values = idx.values.astype(dtype) + + exp = np.array([False, True, False]) + tm.assert_numpy_array_equal(isnull(values), exp) + tm.assert_numpy_array_equal(notnull(values), ~exp) + + exp = pd.Series([False, True, False]) + s = pd.Series(values) + tm.assert_series_equal(isnull(s), exp) + tm.assert_series_equal(notnull(s), ~exp) + s = pd.Series(values, dtype=object) + tm.assert_series_equal(isnull(s), exp) + tm.assert_series_equal(notnull(s), ~exp) + + def test_timedelta_other_units(self): + idx = pd.TimedeltaIndex(['1 days', 'NaT', '2 days']) + exp = np.array([False, True, False]) + tm.assert_numpy_array_equal(isnull(idx), exp) + tm.assert_numpy_array_equal(notnull(idx), ~exp) + tm.assert_numpy_array_equal(isnull(idx.values), exp) + tm.assert_numpy_array_equal(notnull(idx.values), ~exp) + + for dtype in ['timedelta64[D]', 'timedelta64[h]', 'timedelta64[m]', + 'timedelta64[s]', 'timedelta64[ms]', 'timedelta64[us]', + 'timedelta64[ns]']: + values = 
idx.values.astype(dtype) + + exp = np.array([False, True, False]) + tm.assert_numpy_array_equal(isnull(values), exp) + tm.assert_numpy_array_equal(notnull(values), ~exp) + + exp = pd.Series([False, True, False]) + s = pd.Series(values) + tm.assert_series_equal(isnull(s), exp) + tm.assert_series_equal(notnull(s), ~exp) + s = pd.Series(values, dtype=object) + tm.assert_series_equal(isnull(s), exp) + tm.assert_series_equal(notnull(s), ~exp) + + def test_period(self): + idx = pd.PeriodIndex(['2011-01', 'NaT', '2012-01'], freq='M') + exp = np.array([False, True, False]) + tm.assert_numpy_array_equal(isnull(idx), exp) + tm.assert_numpy_array_equal(notnull(idx), ~exp) + + exp = pd.Series([False, True, False]) + s = pd.Series(idx) + tm.assert_series_equal(isnull(s), exp) + tm.assert_series_equal(notnull(s), ~exp) + s = pd.Series(idx, dtype=object) + tm.assert_series_equal(isnull(s), exp) + tm.assert_series_equal(notnull(s), ~exp) + + +def test_array_equivalent(): + assert array_equivalent(np.array([np.nan, np.nan]), + np.array([np.nan, np.nan])) + assert array_equivalent(np.array([np.nan, 1, np.nan]), + np.array([np.nan, 1, np.nan])) + assert array_equivalent(np.array([np.nan, None], dtype='object'), + np.array([np.nan, None], dtype='object')) + assert array_equivalent(np.array([np.nan, 1 + 1j], dtype='complex'), + np.array([np.nan, 1 + 1j], dtype='complex')) + assert not array_equivalent( + np.array([np.nan, 1 + 1j], dtype='complex'), np.array( + [np.nan, 1 + 2j], dtype='complex')) + assert not array_equivalent( + np.array([np.nan, 1, np.nan]), np.array([np.nan, 2, np.nan])) + assert not array_equivalent( + np.array(['a', 'b', 'c', 'd']), np.array(['e', 'e'])) + assert array_equivalent(Float64Index([0, np.nan]), + Float64Index([0, np.nan])) + assert not array_equivalent( + Float64Index([0, np.nan]), Float64Index([1, np.nan])) + assert array_equivalent(DatetimeIndex([0, np.nan]), + DatetimeIndex([0, np.nan])) + assert not array_equivalent( + DatetimeIndex([0, np.nan]), DatetimeIndex([1, np.nan])) + assert array_equivalent(TimedeltaIndex([0, np.nan]), + TimedeltaIndex([0, np.nan])) + assert not array_equivalent( + TimedeltaIndex([0, np.nan]), TimedeltaIndex([1, np.nan])) + assert array_equivalent(DatetimeIndex([0, np.nan], tz='US/Eastern'), + DatetimeIndex([0, np.nan], tz='US/Eastern')) + assert not array_equivalent( + DatetimeIndex([0, np.nan], tz='US/Eastern'), DatetimeIndex( + [1, np.nan], tz='US/Eastern')) + assert not array_equivalent( + DatetimeIndex([0, np.nan]), DatetimeIndex( + [0, np.nan], tz='US/Eastern')) + assert not array_equivalent( + DatetimeIndex([0, np.nan], tz='CET'), DatetimeIndex( + [0, np.nan], tz='US/Eastern')) + assert not array_equivalent( + DatetimeIndex([0, np.nan]), TimedeltaIndex([0, np.nan])) + + +def test_array_equivalent_compat(): + # see gh-13388 + m = np.array([(1, 2), (3, 4)], dtype=[('a', int), ('b', float)]) + n = np.array([(1, 2), (3, 4)], dtype=[('a', int), ('b', float)]) + assert (array_equivalent(m, n, strict_nan=True)) + assert (array_equivalent(m, n, strict_nan=False)) + + m = np.array([(1, 2), (3, 4)], dtype=[('a', int), ('b', float)]) + n = np.array([(1, 2), (4, 3)], dtype=[('a', int), ('b', float)]) + assert (not array_equivalent(m, n, strict_nan=True)) + assert (not array_equivalent(m, n, strict_nan=False)) + + m = np.array([(1, 2), (3, 4)], dtype=[('a', int), ('b', float)]) + n = np.array([(1, 2), (3, 4)], dtype=[('b', int), ('a', float)]) + assert (not array_equivalent(m, n, strict_nan=True)) + assert (not array_equivalent(m, n, strict_nan=False)) + 
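+# Illustrative note (a sketch, not an exhaustive spec): array_equivalent
+# treats NaNs in corresponding positions as equal, unlike plain elementwise
+# comparison, e.g.
+#   array_equivalent(np.array([1.0, np.nan]), np.array([1.0, np.nan]))  # True
+#   (np.array([1.0, np.nan]) == np.array([1.0, np.nan])).all()          # False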
+ +def test_array_equivalent_str(): + for dtype in ['O', 'S', 'U']: + assert array_equivalent(np.array(['A', 'B'], dtype=dtype), + np.array(['A', 'B'], dtype=dtype)) + assert not array_equivalent(np.array(['A', 'B'], dtype=dtype), + np.array(['A', 'X'], dtype=dtype)) + + +def test_na_value_for_dtype(): + for dtype in [np.dtype('M8[ns]'), np.dtype('m8[ns]'), + DatetimeTZDtype('datetime64[ns, US/Eastern]')]: + assert na_value_for_dtype(dtype) is NaT + + for dtype in ['u1', 'u2', 'u4', 'u8', + 'i1', 'i2', 'i4', 'i8']: + assert na_value_for_dtype(np.dtype(dtype)) == 0 + + for dtype in ['bool']: + assert na_value_for_dtype(np.dtype(dtype)) is False + + for dtype in ['f2', 'f4', 'f8']: + assert np.isnan(na_value_for_dtype(np.dtype(dtype))) + + for dtype in ['O']: + assert np.isnan(na_value_for_dtype(np.dtype(dtype))) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/types/test_types.py b/pandas/tests/types/test_types.py deleted file mode 100644 index b9f6006cab731..0000000000000 --- a/pandas/tests/types/test_types.py +++ /dev/null @@ -1,40 +0,0 @@ -# -*- coding: utf-8 -*- -import nose -import numpy as np - -from pandas import NaT -from pandas.types.api import (DatetimeTZDtype, CategoricalDtype, - na_value_for_dtype, pandas_dtype) - - -def test_pandas_dtype(): - - assert pandas_dtype('datetime64[ns, US/Eastern]') == DatetimeTZDtype( - 'datetime64[ns, US/Eastern]') - assert pandas_dtype('category') == CategoricalDtype() - for dtype in ['M8[ns]', 'm8[ns]', 'object', 'float64', 'int64']: - assert pandas_dtype(dtype) == np.dtype(dtype) - - -def test_na_value_for_dtype(): - for dtype in [np.dtype('M8[ns]'), np.dtype('m8[ns]'), - DatetimeTZDtype('datetime64[ns, US/Eastern]')]: - assert na_value_for_dtype(dtype) is NaT - - for dtype in ['u1', 'u2', 'u4', 'u8', - 'i1', 'i2', 'i4', 'i8']: - assert na_value_for_dtype(np.dtype(dtype)) == 0 - - for dtype in ['bool']: - assert na_value_for_dtype(np.dtype(dtype)) is False - - for dtype in ['f2', 'f4', 'f8']: - assert np.isnan(na_value_for_dtype(np.dtype(dtype))) - - for dtype in ['O']: - assert np.isnan(na_value_for_dtype(np.dtype(dtype))) - - -if __name__ == '__main__': - nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], - exit=False) diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 182c0637ae29c..6521acbd0b733 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -2,30 +2,48 @@ SQL-style merge routines """ +import copy import warnings import numpy as np from pandas.compat import range, lrange, lzip, zip, map, filter import pandas.compat as compat -from pandas.core.categorical import Categorical -from pandas.core.frame import DataFrame, _merge_doc +from pandas import (Categorical, DataFrame, Series, + Index, MultiIndex, Timedelta) +from pandas.core.categorical import (_factorize_from_iterable, + _factorize_from_iterables) +from pandas.core.frame import _merge_doc +from pandas.types.generic import ABCSeries +from pandas.types.common import (is_datetime64tz_dtype, + is_datetime64_dtype, + needs_i8_conversion, + is_int64_dtype, + is_integer_dtype, + is_float_dtype, + is_integer, + is_int_or_datetime_dtype, + is_dtype_equal, + is_bool, + is_list_like, + _ensure_int64, + _ensure_float64, + _ensure_object) +from pandas.types.missing import na_value_for_dtype + from pandas.core.generic import NDFrame -from pandas.core.series import Series -from pandas.core.index import (Index, MultiIndex, _get_combined_index, +from 
pandas.core.index import (_get_combined_index, _ensure_index, _get_consensus_names, _all_indexes_same) from pandas.core.internals import (items_overlap_with_suffix, concatenate_block_managers) from pandas.util.decorators import Appender, Substitution -from pandas.core.common import ABCSeries import pandas.core.algorithms as algos import pandas.core.common as com import pandas.types.concat as _concat -from pandas.types.api import na_value_for_dtype -import pandas.algos as _algos +import pandas._join as _join import pandas.hashtable as _hash @@ -47,9 +65,100 @@ class MergeError(ValueError): pass -def ordered_merge(left, right, on=None, left_by=None, right_by=None, +def _groupby_and_merge(by, on, left, right, _merge_pieces, + check_duplicates=True): + """ + groupby & merge; we are always performing a left-by type operation + + Parameters + ---------- + by: field to group + on: duplicates field + left: left frame + right: right frame + _merge_pieces: function for merging + check_duplicates: boolean, default True + should we check & clean duplicates + """ + + pieces = [] + if not isinstance(by, (list, tuple)): + by = [by] + + lby = left.groupby(by, sort=False) + + # if we can groupby the rhs + # then we can get vastly better perf + try: + + # we will check & remove duplicates if indicated + if check_duplicates: + if on is None: + on = [] + elif not isinstance(on, (list, tuple)): + on = [on] + + if right.duplicated(by + on).any(): + right = right.drop_duplicates(by + on, keep='last') + rby = right.groupby(by, sort=False) + except KeyError: + rby = None + + for key, lhs in lby: + + if rby is None: + rhs = right + else: + try: + rhs = right.take(rby.indices[key]) + except KeyError: + # key doesn't exist in right + lcols = lhs.columns.tolist() + cols = lcols + [r for r in right.columns + if r not in set(lcols)] + merged = lhs.reindex(columns=cols) + merged.index = range(len(merged)) + pieces.append(merged) + continue + + merged = _merge_pieces(lhs, rhs) + + # make sure join keys are in the merged + # TODO, should _merge_pieces do this? + for k in by: + try: + if k in merged: + merged[k] = key + except: + pass + + pieces.append(merged) + + # preserve the original order + # if we have a missing piece this can be reset + result = concat(pieces, ignore_index=True) + result = result.reindex(columns=pieces[0].columns, copy=False) + return result, lby + + +def ordered_merge(left, right, on=None, left_on=None, right_on=None, + left_by=None, right_by=None, fill_method=None, suffixes=('_x', '_y')): + + warnings.warn("ordered_merge is deprecated and replaced by merge_ordered", + FutureWarning, stacklevel=2) + return merge_ordered(left, right, on=on, + left_on=left_on, right_on=right_on, + left_by=left_by, right_by=right_by, + fill_method=fill_method, suffixes=suffixes) + + +def merge_ordered(left, right, on=None, + left_on=None, right_on=None, + left_by=None, right_by=None, + fill_method=None, suffixes=('_x', '_y'), + how='outer'): """Perform merge with optional filling/interpolation designed for ordered data like time series data. Optionally perform group-wise merge (see examples) @@ -58,8 +167,6 @@ def ordered_merge(left, right, on=None, left_by=None, right_by=None, ---------- left : DataFrame right : DataFrame - fill_method : {'ffill', None}, default None - Interpolation method for data on : label or list Field names to join on. Must be found in both DataFrames. 
left_on : label or list, or array-like @@ -75,9 +182,18 @@ def ordered_merge(left, right, on=None, left_by=None, right_by=None, right_by : column name or list of column names Group right DataFrame by group columns and merge piece by piece with left DataFrame + fill_method : {'ffill', None}, default None + Interpolation method for data suffixes : 2-length sequence (tuple, list, ...) Suffix to apply to overlapping column names in the left and right side, respectively + how : {'left', 'right', 'outer', 'inner'}, default 'outer' + * left: use only keys from left frame (SQL: left outer join) + * right: use only keys from right frame (SQL: right outer join) + * outer: use union of keys from both frames (SQL: full outer join) + * inner: use intersection of keys from both frames (SQL: inner join) + + .. versionadded:: 0.19.0 Examples -------- @@ -110,46 +226,202 @@ def ordered_merge(left, right, on=None, left_by=None, right_by=None, merged : DataFrame The output type will the be same as 'left', if it is a subclass of DataFrame. + + See also + -------- + merge + merge_asof + """ def _merger(x, y): + # perform the ordered merge operation op = _OrderedMerge(x, y, on=on, left_on=left_on, right_on=right_on, - # left_index=left_index, right_index=right_index, - suffixes=suffixes, fill_method=fill_method) + suffixes=suffixes, fill_method=fill_method, + how=how) return op.get_result() if left_by is not None and right_by is not None: raise ValueError('Can only group either left or right frames') elif left_by is not None: - if not isinstance(left_by, (list, tuple)): - left_by = [left_by] - pieces = [] - for key, xpiece in left.groupby(left_by): - merged = _merger(xpiece, right) - for k in left_by: - # May have passed ndarray - try: - if k in merged: - merged[k] = key - except: - pass - pieces.append(merged) - return concat(pieces, ignore_index=True) + result, _ = _groupby_and_merge(left_by, on, left, right, + lambda x, y: _merger(x, y), + check_duplicates=False) elif right_by is not None: - if not isinstance(right_by, (list, tuple)): - right_by = [right_by] - pieces = [] - for key, ypiece in right.groupby(right_by): - merged = _merger(left, ypiece) - for k in right_by: - try: - if k in merged: - merged[k] = key - except: - pass - pieces.append(merged) - return concat(pieces, ignore_index=True) + result, _ = _groupby_and_merge(right_by, on, right, left, + lambda x, y: _merger(y, x), + check_duplicates=False) else: - return _merger(left, right) + result = _merger(left, right) + return result + +ordered_merge.__doc__ = merge_ordered.__doc__ + + +def merge_asof(left, right, on=None, + left_on=None, right_on=None, + by=None, + suffixes=('_x', '_y'), + tolerance=None, + allow_exact_matches=True): + """Perform an asof merge. This is similar to a left-join except that we + match on nearest key rather than equal keys. + + For each row in the left DataFrame, we select the last row in the right + DataFrame whose 'on' key is less than or equal to the left's key. Both + DataFrames must be sorted by the key. + + Optionally perform group-wise merge. This searches for the nearest match + on the 'on' key within the same group according to 'by'. + + .. versionadded:: 0.19.0 + + Parameters + ---------- + left : DataFrame + right : DataFrame + on : label + Field name to join on. Must be found in both DataFrames. + The data MUST be ordered. Furthermore this must be a numeric column, + such as datetimelike, integer, or float. On or left_on/right_on + must be given. + left_on : label + Field name to join on in left DataFrame. 
+ right_on : label + Field name to join on in right DataFrame. + by : column name + Group both the left and right DataFrames by the group column; perform + the merge operation on these pieces and recombine. + suffixes : 2-length sequence (tuple, list, ...) + Suffix to apply to overlapping column names in the left and right + side, respectively + tolerance : integer or Timedelta, optional, default None + select asof tolerance within this range; must be compatible + with the merge index. + allow_exact_matches : boolean, default True + + - If True, allow matching the same 'on' value + (i.e. less-than-or-equal-to) + - If False, don't match the same 'on' value + (i.e., strictly less-than) + + Returns + ------- + merged : DataFrame + + Examples + -------- + >>> left + a left_val + 0 1 a + 1 5 b + 2 10 c + + >>> right + a right_val + 0 1 1 + 1 2 2 + 2 3 3 + 3 6 6 + 4 7 7 + + >>> pd.merge_asof(left, right, on='a') + a left_val right_val + 0 1 a 1 + 1 5 b 3 + 2 10 c 7 + + >>> pd.merge_asof(left, right, on='a', allow_exact_matches=False) + a left_val right_val + 0 1 a NaN + 1 5 b 3.0 + 2 10 c 7.0 + + For this example, we can achieve a similar result through + ``pd.merge_ordered()``, though it's not nearly as performant. + + >>> (pd.merge_ordered(left, right, on='a') + ... .ffill() + ... .drop_duplicates(['left_val']) + ... ) + a left_val right_val + 0 1 a 1.0 + 3 5 b 3.0 + 6 10 c 7.0 + + Here is a real-world time-series example + + >>> quotes + time ticker bid ask + 0 2016-05-25 13:30:00.023 GOOG 720.50 720.93 + 1 2016-05-25 13:30:00.023 MSFT 51.95 51.96 + 2 2016-05-25 13:30:00.030 MSFT 51.97 51.98 + 3 2016-05-25 13:30:00.041 MSFT 51.99 52.00 + 4 2016-05-25 13:30:00.048 GOOG 720.50 720.93 + 5 2016-05-25 13:30:00.049 AAPL 97.99 98.01 + 6 2016-05-25 13:30:00.072 GOOG 720.50 720.88 + 7 2016-05-25 13:30:00.075 MSFT 52.01 52.03 + + >>> trades + time ticker price quantity + 0 2016-05-25 13:30:00.023 MSFT 51.95 75 + 1 2016-05-25 13:30:00.038 MSFT 51.95 155 + 2 2016-05-25 13:30:00.048 GOOG 720.77 100 + 3 2016-05-25 13:30:00.048 GOOG 720.92 100 + 4 2016-05-25 13:30:00.048 AAPL 98.00 100 + + By default we are taking the asof of the quotes + + >>> pd.merge_asof(trades, quotes, + ... on='time', + ... by='ticker') + time ticker price quantity bid ask + 0 2016-05-25 13:30:00.023 MSFT 51.95 75 51.95 51.96 + 1 2016-05-25 13:30:00.038 MSFT 51.95 155 51.97 51.98 + 2 2016-05-25 13:30:00.048 GOOG 720.77 100 720.50 720.93 + 3 2016-05-25 13:30:00.048 GOOG 720.92 100 720.50 720.93 + 4 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN + + We only asof within 2ms between the quote time and the trade time + + >>> pd.merge_asof(trades, quotes, + ... on='time', + ... by='ticker', + ... tolerance=pd.Timedelta('2ms')) + time ticker price quantity bid ask + 0 2016-05-25 13:30:00.023 MSFT 51.95 75 51.95 51.96 + 1 2016-05-25 13:30:00.038 MSFT 51.95 155 NaN NaN + 2 2016-05-25 13:30:00.048 GOOG 720.77 100 720.50 720.93 + 3 2016-05-25 13:30:00.048 GOOG 720.92 100 720.50 720.93 + 4 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN + + We only asof within 10ms between the quote time and the trade time + and we exclude exact matches on time. However *prior* data will + propagate forward + + >>> pd.merge_asof(trades, quotes, + ... on='time', + ... by='ticker', + ... tolerance=pd.Timedelta('10ms'), + ... 
allow_exact_matches=False) + time ticker price quantity bid ask + 0 2016-05-25 13:30:00.023 MSFT 51.95 75 NaN NaN + 1 2016-05-25 13:30:00.038 MSFT 51.95 155 51.97 51.98 + 2 2016-05-25 13:30:00.048 GOOG 720.77 100 720.50 720.93 + 3 2016-05-25 13:30:00.048 GOOG 720.92 100 720.50 720.93 + 4 2016-05-25 13:30:00.048 AAPL 98.00 100 NaN NaN + + See also + -------- + merge + merge_ordered + + """ + op = _AsOfMerge(left, right, + on=on, left_on=left_on, right_on=right_on, + by=by, suffixes=suffixes, + how='asof', tolerance=tolerance, + allow_exact_matches=allow_exact_matches) + return op.get_result() # TODO: transformations?? @@ -159,6 +431,7 @@ class _MergeOperation(object): Perform a database (SQL) merge operation between two DataFrame objects using either columns as keys or their row indexes """ + _merge_type = 'merge' def __init__(self, left, right, how='inner', on=None, left_on=None, right_on=None, axis=1, @@ -206,6 +479,8 @@ def __init__(self, left, right, how='inner', on=None, msg = msg.format(left.columns.nlevels, right.columns.nlevels) warnings.warn(msg, UserWarning) + self._validate_specification() + # note this function has side effects (self.left_join_keys, self.right_join_keys, @@ -233,7 +508,7 @@ def get_result(self): concat_axis=0, copy=self.copy) typ = self.left._constructor - result = typ(result_data).__finalize__(self, method='merge') + result = typ(result_data).__finalize__(self, method=self._merge_type) if self.indicator: result = self._indicator_post_merge(result) @@ -299,25 +574,25 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): if name in self.left: if left_has_missing is None: - left_has_missing = any(left_indexer == -1) + left_has_missing = (left_indexer == -1).any() if left_has_missing: take_right = self.right_join_keys[i] - if not com.is_dtype_equal(result[name].dtype, - self.left[name].dtype): + if not is_dtype_equal(result[name].dtype, + self.left[name].dtype): take_left = self.left[name]._values elif name in self.right: if right_has_missing is None: - right_has_missing = any(right_indexer == -1) + right_has_missing = (right_indexer == -1).any() if right_has_missing: take_left = self.left_join_keys[i] - if not com.is_dtype_equal(result[name].dtype, - self.right[name].dtype): + if not is_dtype_equal(result[name].dtype, + self.right[name].dtype): take_right = self.right[name]._values elif left_indexer is not None \ @@ -355,6 +630,13 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): else: result.insert(i, name or 'key_%d' % i, key_col) + def _get_join_indexers(self): + """ return the join indexers """ + return _get_join_indexers(self.left_join_keys, + self.right_join_keys, + sort=self.sort, + how=self.how) + def _get_join_info(self): left_ax = self.left._data.axes[self.axis] right_ax = self.right._data.axes[self.axis] @@ -373,9 +655,8 @@ def _get_join_info(self): sort=self.sort) else: (left_indexer, - right_indexer) = _get_join_indexers(self.left_join_keys, - self.right_join_keys, - sort=self.sort, how=self.how) + right_indexer) = self._get_join_indexers() + if self.right_index: if len(self.left) > 0: join_index = self.left.index.take(left_indexer) @@ -429,8 +710,6 @@ def _get_merge_keys(self): ------- left_keys, right_keys """ - self._validate_specification() - left_keys = [] right_keys = [] join_names = [] @@ -549,7 +828,8 @@ def _validate_specification(self): raise ValueError("len(right_on) must equal len(left_on)") -def _get_join_indexers(left_keys, right_keys, sort=False, how='inner'): +def _get_join_indexers(left_keys, 
right_keys, sort=False, how='inner', + **kwargs): """ Parameters @@ -579,26 +859,27 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how='inner'): lkey, rkey, count = fkeys(lkey, rkey) # preserve left frame order if how == 'left' and sort == False - kwargs = {'sort': sort} if how == 'left' else {} + kwargs = copy.copy(kwargs) + if how == 'left': + kwargs['sort'] = sort join_func = _join_functions[how] + return join_func(lkey, rkey, count, **kwargs) class _OrderedMerge(_MergeOperation): + _merge_type = 'ordered_merge' - def __init__(self, left, right, on=None, by=None, left_on=None, - right_on=None, axis=1, left_index=False, right_index=False, + def __init__(self, left, right, on=None, left_on=None, + right_on=None, axis=1, suffixes=('_x', '_y'), copy=True, - fill_method=None): + fill_method=None, how='outer'): self.fill_method = fill_method - _MergeOperation.__init__(self, left, right, on=on, left_on=left_on, right_on=right_on, axis=axis, - left_index=left_index, - right_index=right_index, - how='outer', suffixes=suffixes, - sort=True # sorts when factorizing + how=how, suffixes=suffixes, + sort=True # factorize sorts ) def get_result(self): @@ -612,8 +893,8 @@ def get_result(self): rdata.items, rsuf) if self.fill_method == 'ffill': - left_join_indexer = _algos.ffill_indexer(left_indexer) - right_join_indexer = _algos.ffill_indexer(right_indexer) + left_join_indexer = _join.ffill_indexer(left_indexer) + right_join_indexer = _join.ffill_indexer(right_indexer) else: left_join_indexer = left_indexer right_join_indexer = right_indexer @@ -629,13 +910,191 @@ def get_result(self): concat_axis=0, copy=self.copy) typ = self.left._constructor - result = typ(result_data).__finalize__(self, method='ordered_merge') + result = typ(result_data).__finalize__(self, method=self._merge_type) self._maybe_add_join_keys(result, left_indexer, right_indexer) return result +_asof_functions = { + 'int64_t': _join.asof_join_int64_t, + 'double': _join.asof_join_double, +} + +_asof_by_functions = { + ('int64_t', 'int64_t'): _join.asof_join_int64_t_by_int64_t, + ('double', 'int64_t'): _join.asof_join_double_by_int64_t, + ('int64_t', 'object'): _join.asof_join_int64_t_by_object, + ('double', 'object'): _join.asof_join_double_by_object, +} + +_type_casters = { + 'int64_t': _ensure_int64, + 'double': _ensure_float64, + 'object': _ensure_object, +} + + +def _get_cython_type(dtype): + """ Given a dtype, return 'int64_t', 'double', or 'object' """ + if is_integer_dtype(dtype): + return 'int64_t' + elif is_float_dtype(dtype): + return 'double' + else: + return 'object' + + +class _AsOfMerge(_OrderedMerge): + _merge_type = 'asof_merge' + + def __init__(self, left, right, on=None, by=None, left_on=None, + right_on=None, axis=1, + suffixes=('_x', '_y'), copy=True, + fill_method=None, + how='asof', tolerance=None, + allow_exact_matches=True): + + self.by = by + self.tolerance = tolerance + self.allow_exact_matches = allow_exact_matches + + _OrderedMerge.__init__(self, left, right, on=on, left_on=left_on, + right_on=right_on, axis=axis, + how=how, suffixes=suffixes, + fill_method=fill_method) + + def _validate_specification(self): + super(_AsOfMerge, self)._validate_specification() + + # we only allow on to be a single item for on + if len(self.left_on) != 1: + raise MergeError("can only asof on a key for left") + + if len(self.right_on) != 1: + raise MergeError("can only asof on a key for right") + + # add by to our key-list so we can have it in the + # output as a key + if self.by is not None: + if not 
is_list_like(self.by): + self.by = [self.by] + + if len(self.by) != 1: + raise MergeError("can only asof by a single key") + + self.left_on = self.by + list(self.left_on) + self.right_on = self.by + list(self.right_on) + + @property + def _asof_key(self): + """ This is our asof key, the 'on' """ + return self.left_on[-1] + + def _get_merge_keys(self): + + # note this function has side effects + (left_join_keys, + right_join_keys, + join_names) = super(_AsOfMerge, self)._get_merge_keys() + + # validate index types are the same + for lk, rk in zip(left_join_keys, right_join_keys): + if not is_dtype_equal(lk.dtype, rk.dtype): + raise MergeError("incompatible merge keys, " + "must be the same type") + + # validate tolerance; must be a Timedelta if we have a DTI + if self.tolerance is not None: + + lt = left_join_keys[self.left_on.index(self._asof_key)] + msg = "incompatible tolerance, must be compat " \ + "with type {0}".format(type(lt)) + + if is_datetime64_dtype(lt): + if not isinstance(self.tolerance, Timedelta): + raise MergeError(msg) + if self.tolerance < Timedelta(0): + raise MergeError("tolerance must be positive") + + elif is_int64_dtype(lt): + if not is_integer(self.tolerance): + raise MergeError(msg) + if self.tolerance < 0: + raise MergeError("tolerance must be positive") + + else: + raise MergeError(msg) + + # validate allow_exact_matches + if not is_bool(self.allow_exact_matches): + raise MergeError("allow_exact_matches must be boolean, " + "passed {0}".format(self.allow_exact_matches)) + + return left_join_keys, right_join_keys, join_names + + def _get_join_indexers(self): + """ return the join indexers """ + + # values to compare + left_values = self.left_join_keys[-1] + right_values = self.right_join_keys[-1] + tolerance = self.tolerance + + # we required sortedness in the join keys + msg = " keys must be sorted" + if not Index(left_values).is_monotonic: + raise ValueError('left' + msg) + if not Index(right_values).is_monotonic: + raise ValueError('right' + msg) + + # initial type conversion as needed + if needs_i8_conversion(left_values): + left_values = left_values.view('i8') + right_values = right_values.view('i8') + if tolerance is not None: + tolerance = tolerance.value + + # a "by" parameter requires special handling + if self.by is not None: + left_by_values = self.left_join_keys[0] + right_by_values = self.right_join_keys[0] + + # choose appropriate function by type + on_type = _get_cython_type(left_values.dtype) + by_type = _get_cython_type(left_by_values.dtype) + + on_type_caster = _type_casters[on_type] + by_type_caster = _type_casters[by_type] + func = _asof_by_functions[(on_type, by_type)] + + left_values = on_type_caster(left_values) + right_values = on_type_caster(right_values) + left_by_values = by_type_caster(left_by_values) + right_by_values = by_type_caster(right_by_values) + + return func(left_values, + right_values, + left_by_values, + right_by_values, + self.allow_exact_matches, + tolerance) + else: + # choose appropriate function by type + on_type = _get_cython_type(left_values.dtype) + type_caster = _type_casters[on_type] + func = _asof_functions[on_type] + + left_values = type_caster(left_values) + right_values = type_caster(right_values) + + return func(left_values, + right_values, + self.allow_exact_matches, + tolerance) + + def _get_multiindex_indexer(join_keys, index, sort): from functools import partial @@ -668,15 +1127,15 @@ def _get_multiindex_indexer(join_keys, index, sort): # factorize keys to a dense i8 space lkey, rkey, count = fkeys(lkey, 
rkey) - return _algos.left_outer_join(lkey, rkey, count, sort=sort) + return _join.left_outer_join(lkey, rkey, count, sort=sort) def _get_single_indexer(join_key, index, sort=False): left_key, right_key, count = _factorize_keys(join_key, index, sort=sort) - left_indexer, right_indexer = _algos.left_outer_join( - com._ensure_int64(left_key), - com._ensure_int64(right_key), + left_indexer, right_indexer = _join.left_outer_join( + _ensure_int64(left_key), + _ensure_int64(right_key), count, sort=sort) return left_indexer, right_indexer @@ -709,29 +1168,29 @@ def _left_join_on_index(left_ax, right_ax, join_keys, sort=False): def _right_outer_join(x, y, max_groups): - right_indexer, left_indexer = _algos.left_outer_join(y, x, max_groups) + right_indexer, left_indexer = _join.left_outer_join(y, x, max_groups) return left_indexer, right_indexer _join_functions = { - 'inner': _algos.inner_join, - 'left': _algos.left_outer_join, + 'inner': _join.inner_join, + 'left': _join.left_outer_join, 'right': _right_outer_join, - 'outer': _algos.full_outer_join, + 'outer': _join.full_outer_join, } def _factorize_keys(lk, rk, sort=True): - if com.is_datetime64tz_dtype(lk) and com.is_datetime64tz_dtype(rk): + if is_datetime64tz_dtype(lk) and is_datetime64tz_dtype(rk): lk = lk.values rk = rk.values - if com.is_int_or_datetime_dtype(lk) and com.is_int_or_datetime_dtype(rk): + if is_int_or_datetime_dtype(lk) and is_int_or_datetime_dtype(rk): klass = _hash.Int64Factorizer - lk = com._ensure_int64(com._values_from_object(lk)) - rk = com._ensure_int64(com._values_from_object(rk)) + lk = _ensure_int64(com._values_from_object(lk)) + rk = _ensure_int64(com._values_from_object(rk)) else: klass = _hash.Factorizer - lk = com._ensure_object(lk) - rk = com._ensure_object(rk) + lk = _ensure_object(lk) + rk = _ensure_object(rk) rizer = klass(max(len(lk), len(rk))) @@ -765,16 +1224,12 @@ def _sort_labels(uniques, left, right): # tuplesafe uniques = Index(uniques).values - sorter = uniques.argsort() + l = len(left) + labels = np.concatenate([left, right]) - reverse_indexer = np.empty(len(sorter), dtype=np.int64) - reverse_indexer.put(sorter, np.arange(len(sorter))) - - new_left = reverse_indexer.take(com._ensure_platform_int(left)) - np.putmask(new_left, left == -1, -1) - - new_right = reverse_indexer.take(com._ensure_platform_int(right)) - np.putmask(new_right, right == -1, -1) + _, new_labels = algos.safe_sort(uniques, labels, na_sentinel=-1) + new_labels = _ensure_int64(new_labels) + new_left, new_right = new_labels[:l], new_labels[l:] return new_left, new_right @@ -835,9 +1290,12 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, join_axes : list of Index objects Specific indexes to use for the other n - 1 axes instead of performing inner/outer set logic - verify_integrity : boolean, default False - Check whether the new concatenated axis contains duplicates. This can - be very expensive relative to the actual data concatenation + ignore_index : boolean, default False + If True, do not use the index values along the concatenation axis. The + resulting axis will be labeled 0, ..., n - 1. This is useful if you are + concatenating objects where the concatenation axis does not have + meaningful indexing information. Note the index values on the other + axes are still respected in the join. keys : sequence, default None If multiple levels passed, should contain tuples. 
Construct hierarchical index using the passed keys as the outermost level @@ -846,12 +1304,9 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, MultiIndex. Otherwise they will be inferred from the keys names : list, default None Names for the levels in the resulting hierarchical index - ignore_index : boolean, default False - If True, do not use the index values along the concatenation axis. The - resulting axis will be labeled 0, ..., n - 1. This is useful if you are - concatenating objects where the concatenation axis does not have - meaningful indexing information. Note the index values on the other - axes are still respected in the join. + verify_integrity : boolean, default False + Check whether the new concatenated axis contains duplicates. This can + be very expensive relative to the actual data concatenation copy : boolean, default True If False, do not copy data unnecessarily @@ -1057,10 +1512,9 @@ def get_result(self): mgrs_indexers.append((obj._data, indexers)) - new_data = concatenate_block_managers(mgrs_indexers, - self.new_axes, - concat_axis=self.axis, - copy=self.copy) + new_data = concatenate_block_managers( + mgrs_indexers, self.new_axes, concat_axis=self.axis, + copy=self.copy) if not self.copy: new_data._consolidate_inplace() @@ -1179,8 +1633,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None): names = [None] * len(zipped) if levels is None: - levels = [Categorical.from_array( - zp, ordered=True).categories for zp in zipped] + _, levels = _factorize_from_iterables(zipped) else: levels = [_ensure_index(x) for x in levels] else: @@ -1218,9 +1671,9 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None): levels.extend(concat_index.levels) label_list.extend(concat_index.labels) else: - factor = Categorical.from_array(concat_index, ordered=True) - levels.append(factor.categories) - label_list.append(factor.codes) + codes, categories = _factorize_from_iterable(concat_index) + levels.append(categories) + label_list.append(codes) if len(names) == len(levels): names = list(names) diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py index a4e6cc404a457..94b464f6fca6c 100644 --- a/pandas/tools/pivot.py +++ b/pandas/tools/pivot.py @@ -1,6 +1,7 @@ # pylint: disable=E1103 +from pandas.types.common import is_list_like, is_scalar from pandas import Series, DataFrame from pandas.core.index import MultiIndex, Index from pandas.core.groupby import Grouper @@ -9,7 +10,6 @@ from pandas.compat import range, lrange, zip from pandas import compat import pandas.core.common as com -import pandas.lib as lib import numpy as np @@ -86,7 +86,7 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', table = pivot_table(data, values=values, index=index, columns=columns, fill_value=fill_value, aggfunc=func, - margins=margins) + margins=margins, margins_name=margins_name) pieces.append(table) keys.append(func.__name__) return concat(pieces, keys=keys, axis=1) @@ -95,7 +95,7 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', values_passed = values is not None if values_passed: - if com.is_list_like(values): + if is_list_like(values): values_multi = True values = list(values) else: @@ -361,7 +361,7 @@ def _all_key(): def _convert_by(by): if by is None: by = [] - elif (lib.isscalar(by) or + elif (is_scalar(by) or isinstance(by, (np.ndarray, Index, Series, Grouper)) or hasattr(by, '__call__')): by = [by] @@ -523,6 +523,9 @@ def _normalize(table, normalize, margins): column_margin = 
table.loc[:, 'All'].drop('All') index_margin = table.loc['All', :].drop('All') table = table.drop('All', axis=1).drop('All') + # to keep index and columns names + table_index_names = table.index.names + table_columns_names = table.columns.names # Normalize core table = _normalize(table, normalize=normalize, margins=False) @@ -550,6 +553,9 @@ def _normalize(table, normalize, margins): else: raise ValueError("Not a valid normalize argument") + table.index.names = table_index_names + table.columns.names = table_columns_names + else: raise ValueError("Not a valid margins argument") diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index baca8045f0cc1..7fd0b1044f9d7 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -11,10 +11,17 @@ import numpy as np +from pandas.types.common import (is_list_like, + is_integer, + is_number, + is_hashable, + is_iterator) +from pandas.types.missing import isnull, notnull + from pandas.util.decorators import cache_readonly, deprecate_kwarg from pandas.core.base import PandasObject -import pandas.core.common as com -from pandas.core.common import AbstractMethodError + +from pandas.core.common import AbstractMethodError, _try_sort from pandas.core.generic import _shared_docs, _shared_doc_kwargs from pandas.core.index import Index, MultiIndex from pandas.core.series import Series, remove_na @@ -134,6 +141,14 @@ def _mpl_ge_1_5_0(): except ImportError: return False + +def _mpl_ge_2_0_0(): + try: + import matplotlib + return matplotlib.__version__ >= LooseVersion('2.0') + except ImportError: + return False + if _mpl_ge_1_5_0(): # Compat with mp 1.5, which uses cycler. import cycler @@ -161,7 +176,7 @@ def _get_standard_colors(num_colors=None, colormap=None, color_type='default', if colormap is not None: warnings.warn("'color' and 'colormap' cannot be used " "simultaneously. Using 'color'") - colors = list(color) if com.is_list_like(color) else color + colors = list(color) if is_list_like(color) else color else: if color_type == 'default': # need to call list() on the result to copy so we don't @@ -336,7 +351,7 @@ def scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False, # no gaps between subplots fig.subplots_adjust(wspace=0, hspace=0) - mask = com.notnull(df) + mask = notnull(df) marker = _get_marker_compat(marker) @@ -980,7 +995,7 @@ def _validate_color_args(self): "simultaneously. Using 'color'") if 'color' in self.kwds and self.style is not None: - if com.is_list_like(self.style): + if is_list_like(self.style): styles = self.style else: styles = [self.style] @@ -1001,7 +1016,7 @@ def _iter_data(self, data=None, keep_index=False, fillna=None): # TODO: unused? 
# if self.sort_columns: - # columns = com._try_sort(data.columns) + # columns = _try_sort(data.columns) # else: # columns = data.columns @@ -1099,13 +1114,13 @@ def result(self): Return result axes """ if self.subplots: - if self.layout is not None and not com.is_list_like(self.ax): + if self.layout is not None and not is_list_like(self.ax): return self.axes.reshape(*self.layout) else: return self.axes else: sec_true = isinstance(self.secondary_y, bool) and self.secondary_y - all_sec = (com.is_list_like(self.secondary_y) and + all_sec = (is_list_like(self.secondary_y) and len(self.secondary_y) == self.nseries) if (sec_true or all_sec): # if all data is plotted on secondary, return right axes @@ -1322,7 +1337,7 @@ def _get_xticks(self, convert_period=False): @classmethod def _plot(cls, ax, x, y, style=None, is_errorbar=False, **kwds): - mask = com.isnull(y) + mask = isnull(y) if mask.any(): y = np.ma.array(y) y = np.ma.masked_where(mask, y) @@ -1463,8 +1478,8 @@ def match_labels(data, e): err = np.atleast_2d(evalues) err = np.tile(err, (self.nseries, 1)) - elif com.is_list_like(err): - if com.is_iterator(err): + elif is_list_like(err): + if is_iterator(err): err = np.atleast_2d(list(err)) else: # raw error values @@ -1486,7 +1501,7 @@ def match_labels(data, e): if len(err) == 1: err = np.tile(err, (self.nseries, 1)) - elif com.is_number(err): + elif is_number(err): err = np.tile([err], (self.nseries, len(self.data))) else: @@ -1543,9 +1558,9 @@ def __init__(self, data, x, y, **kwargs): MPLPlot.__init__(self, data, **kwargs) if x is None or y is None: raise ValueError(self._kind + ' requires and x and y column') - if com.is_integer(x) and not self.data.columns.holds_integer(): + if is_integer(x) and not self.data.columns.holds_integer(): x = self.data.columns[x] - if com.is_integer(y) and not self.data.columns.holds_integer(): + if is_integer(y) and not self.data.columns.holds_integer(): y = self.data.columns[y] self.x = x self.y = y @@ -1569,7 +1584,7 @@ def __init__(self, data, x, y, s=None, c=None, **kwargs): # the handling of this argument later s = 20 super(ScatterPlot, self).__init__(data, x, y, s=s, **kwargs) - if com.is_integer(c) and not self.data.columns.holds_integer(): + if is_integer(c) and not self.data.columns.holds_integer(): c = self.data.columns[c] self.c = c @@ -1577,7 +1592,7 @@ def _make_plot(self): x, y, c, data = self.x, self.y, self.c, self.data ax = self.axes[0] - c_is_column = com.is_hashable(c) and c in self.data.columns + c_is_column = is_hashable(c) and c in self.data.columns # plot a colorbar only if a colormap is provided or necessary cb = self.kwds.pop('colorbar', self.colormap or c_is_column) @@ -1629,7 +1644,7 @@ class HexBinPlot(PlanePlot): def __init__(self, data, x, y, C=None, **kwargs): super(HexBinPlot, self).__init__(data, x, y, **kwargs) - if com.is_integer(C) and not self.data.columns.holds_integer(): + if is_integer(C) and not self.data.columns.holds_integer(): C = self.data.columns[C] self.C = C @@ -1832,10 +1847,16 @@ def __init__(self, data, **kwargs): @classmethod def _plot(cls, ax, x, y, style=None, column_num=None, stacking_id=None, is_errorbar=False, **kwds): + if column_num == 0: cls._initialize_stacker(ax, stacking_id, len(y)) y_values = cls._get_stacked_values(ax, stacking_id, y, kwds['label']) - lines = MPLPlot._plot(ax, x, y_values, style=style, **kwds) + + # need to remove label, because subplots uses mpl legend as it is + line_kwds = kwds.copy() + if cls.mpl_ge_1_5_0(): + line_kwds.pop('label') + lines = MPLPlot._plot(ax, x, y_values, 
style=style, **line_kwds) # get data from the line to get coordinates for fill_between xdata, y_values = lines[0].get_data(orig=False) @@ -1853,18 +1874,21 @@ def _plot(cls, ax, x, y, style=None, column_num=None, if 'color' not in kwds: kwds['color'] = lines[0].get_color() - if cls.mpl_ge_1_5_0(): # mpl 1.5 added real support for poly legends - kwds.pop('label') - ax.fill_between(xdata, start, y_values, **kwds) + rect = ax.fill_between(xdata, start, y_values, **kwds) cls._update_stacker(ax, stacking_id, y) - return lines + + # LinePlot expects list of artists + res = [rect] if cls.mpl_ge_1_5_0() else lines + return res def _add_legend_handle(self, handle, label, index=None): - from matplotlib.patches import Rectangle - # Because fill_between isn't supported in legend, - # specifically add Rectangle handle here - alpha = self.kwds.get('alpha', None) - handle = Rectangle((0, 0), 1, 1, fc=handle.get_color(), alpha=alpha) + if not self.mpl_ge_1_5_0(): + from matplotlib.patches import Rectangle + # Because fill_between isn't supported in legend, + # specifically add Rectangle handle here + alpha = self.kwds.get('alpha', None) + handle = Rectangle((0, 0), 1, 1, fc=handle.get_color(), + alpha=alpha) LinePlot._add_legend_handle(self, handle, label, index=index) def _post_plot_logic(self, ax, data): @@ -1912,9 +1936,9 @@ def __init__(self, data, **kwargs): self.ax_pos = self.tick_pos - self.tickoffset def _args_adjust(self): - if com.is_list_like(self.bottom): + if is_list_like(self.bottom): self.bottom = np.array(self.bottom) - if com.is_list_like(self.left): + if is_list_like(self.left): self.left = np.array(self.left) @classmethod @@ -2027,18 +2051,18 @@ def __init__(self, data, bins=10, bottom=0, **kwargs): MPLPlot.__init__(self, data, **kwargs) def _args_adjust(self): - if com.is_integer(self.bins): + if is_integer(self.bins): # create common bin edge values = (self.data._convert(datetime=True)._get_numeric_data()) values = np.ravel(values) - values = values[~com.isnull(values)] + values = values[~isnull(values)] hist, self.bins = np.histogram( values, bins=self.bins, range=self.kwds.get('range', None), weights=self.kwds.get('weights', None)) - if com.is_list_like(self.bottom): + if is_list_like(self.bottom): self.bottom = np.array(self.bottom) @classmethod @@ -2046,7 +2070,7 @@ def _plot(cls, ax, y, style=None, bins=None, bottom=0, column_num=0, stacking_id=None, **kwds): if column_num == 0: cls._initialize_stacker(ax, stacking_id, len(bins) - 1) - y = y[~com.isnull(y)] + y = y[~isnull(y)] base = np.zeros(len(bins) - 1) bottom = bottom + \ @@ -2223,7 +2247,7 @@ class BoxPlot(LinePlot): # namedtuple to hold results BP = namedtuple("Boxplot", ['ax', 'lines']) - def __init__(self, data, return_type=None, **kwargs): + def __init__(self, data, return_type='axes', **kwargs): # Do not call LinePlot.__init__ which may fill nan if return_type not in self._valid_return_types: raise ValueError( @@ -2242,7 +2266,7 @@ def _args_adjust(self): self.sharey = False @classmethod - def _plot(cls, ax, y, column_num=None, return_type=None, **kwds): + def _plot(cls, ax, y, column_num=None, return_type='axes', **kwds): if y.ndim == 2: y = [remove_na(v) for v in y] # Boxplot fails with empty arrays, so need to add a NaN @@ -2315,7 +2339,7 @@ def maybe_color_bp(self, bp): def _make_plot(self): if self.subplots: - self._return_obj = compat.OrderedDict() + self._return_obj = Series() for i, (label, y) in enumerate(self._iter_data()): ax = self._get_ax(i) @@ -2411,7 +2435,7 @@ def _plot(data, x=None, y=None, 
subplots=False, msg = "{0} requires either y column or 'subplots=True'" raise ValueError(msg.format(kind)) elif y is not None: - if com.is_integer(y) and not data.columns.holds_integer(): + if is_integer(y) and not data.columns.holds_integer(): y = data.columns[y] # converted to series actually. copy to not modify data = data[y].copy() @@ -2420,12 +2444,12 @@ def _plot(data, x=None, y=None, subplots=False, else: if isinstance(data, DataFrame): if x is not None: - if com.is_integer(x) and not data.columns.holds_integer(): + if is_integer(x) and not data.columns.holds_integer(): x = data.columns[x] data = data.set_index(x) if y is not None: - if com.is_integer(y) and not data.columns.holds_integer(): + if is_integer(y) and not data.columns.holds_integer(): y = data.columns[y] label = kwds['label'] if 'label' in kwds else y series = data[y].copy() # Don't modify @@ -2434,7 +2458,7 @@ def _plot(data, x=None, y=None, subplots=False, for kw in ['xerr', 'yerr']: if (kw in kwds) and \ (isinstance(kwds[kw], string_types) or - com.is_integer(kwds[kw])): + is_integer(kwds[kw])): try: kwds[kw] = data[kwds[kw]] except (IndexError, KeyError, TypeError): @@ -2667,14 +2691,17 @@ def plot_series(data, kind='line', ax=None, # Series unique grid : Setting this to True will show the grid layout : tuple (optional) (rows, columns) for the layout of the plot - return_type : {'axes', 'dict', 'both'}, default 'dict' - The kind of object to return. 'dict' returns a dictionary - whose values are the matplotlib Lines of the boxplot; + return_type : {None, 'axes', 'dict', 'both'}, default None + The kind of object to return. The default is ``axes`` 'axes' returns the matplotlib axes the boxplot is drawn on; + 'dict' returns a dictionary whose values are the matplotlib + Lines of the boxplot; 'both' returns a namedtuple with the axes and dict. - When grouping with ``by``, a dict mapping columns to ``return_type`` - is returned. + When grouping with ``by``, a Series mapping columns to ``return_type`` + is returned, unless ``return_type`` is None, in which case a NumPy + array of axes is returned with the same shape as ``layout``. + See the prose documentation for more. 
kwds : other plotting keyword arguments to be passed to matplotlib boxplot function @@ -2700,7 +2727,7 @@ def boxplot(data, column=None, by=None, ax=None, fontsize=None, # validate return_type: if return_type not in BoxPlot._valid_return_types: - raise ValueError("return_type must be {None, 'axes', 'dict', 'both'}") + raise ValueError("return_type must be {'axes', 'dict', 'both'}") from pandas import Series, DataFrame if isinstance(data, Series): @@ -2745,23 +2772,19 @@ def plot_group(keys, values, ax): columns = [column] if by is not None: + # Prefer array return type for 2-D plots to match the subplot layout + # https://github.com/pydata/pandas/pull/12216#issuecomment-241175580 result = _grouped_plot_by_column(plot_group, data, columns=columns, by=by, grid=grid, figsize=figsize, ax=ax, layout=layout, return_type=return_type) else: + if return_type is None: + return_type = 'axes' if layout is not None: raise ValueError("The 'layout' keyword is not supported when " "'by' is None") - if return_type is None: - msg = ("\nThe default value for 'return_type' will change to " - "'axes' in a future release.\n To use the future behavior " - "now, set return_type='axes'.\n To keep the previous " - "behavior and silence this warning, set " - "return_type='dict'.") - warnings.warn(msg, FutureWarning, stacklevel=3) - return_type = 'dict' if ax is None: ax = _gca() data = data._get_numeric_data() @@ -2897,7 +2920,7 @@ def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None, layout=layout) _axes = _flatten(axes) - for i, col in enumerate(com._try_sort(data.columns)): + for i, col in enumerate(_try_sort(data.columns)): ax = _axes[i] ax.hist(data[col].dropna().values, bins=bins, **kwds) ax.set_title(col) @@ -3080,12 +3103,12 @@ def boxplot_frame_groupby(grouped, subplots=True, column=None, fontsize=None, figsize=figsize, layout=layout) axes = _flatten(axes) - ret = compat.OrderedDict() + ret = Series() for (key, group), ax in zip(grouped, axes): d = group.boxplot(ax=ax, column=column, fontsize=fontsize, rot=rot, grid=grid, **kwds) ax.set_title(pprint_thing(key)) - ret[key] = d + ret.loc[key] = d fig.subplots_adjust(bottom=0.15, top=0.9, left=0.1, right=0.9, wspace=0.2) else: @@ -3151,7 +3174,9 @@ def _grouped_plot_by_column(plotf, data, columns=None, by=None, _axes = _flatten(axes) - result = compat.OrderedDict() + result = Series() + ax_values = [] + for i, col in enumerate(columns): ax = _axes[i] gp_col = grouped[col] @@ -3159,9 +3184,11 @@ def _grouped_plot_by_column(plotf, data, columns=None, by=None, re_plotf = plotf(keys, values, ax, **kwargs) ax.set_title(col) ax.set_xlabel(pprint_thing(by)) - result[col] = re_plotf + ax_values.append(re_plotf) ax.grid(grid) + result = Series(ax_values, index=columns) + # Return axes in multiplot case, maybe revisit later # 985 if return_type is None: result = axes @@ -3345,7 +3372,7 @@ def _subplots(naxes=None, sharex=False, sharey=False, squeeze=True, if ax is None: fig = plt.figure(**fig_kw) else: - if com.is_list_like(ax): + if is_list_like(ax): ax = _flatten(ax) if layout is not None: warnings.warn("When passing multiple axes, layout keyword is " @@ -3353,7 +3380,8 @@ def _subplots(naxes=None, sharex=False, sharey=False, squeeze=True, if sharex or sharey: warnings.warn("When passing multiple axes, sharex and sharey " "are ignored. 
These settings must be specified " - "when creating axes", UserWarning) + "when creating axes", UserWarning, + stacklevel=4) if len(ax) == naxes: fig = ax[0].get_figure() return fig, ax @@ -3370,7 +3398,8 @@ def _subplots(naxes=None, sharex=False, sharey=False, squeeze=True, return fig, _flatten(ax) else: warnings.warn("To output multiple subplots, the figure containing " - "the passed axes is being cleared", UserWarning) + "the passed axes is being cleared", UserWarning, + stacklevel=4) fig.clear() nrows, ncols = _get_layout(naxes, layout=layout, layout_type=layout_type) @@ -3485,7 +3514,7 @@ def _handle_shared_axes(axarr, nplots, naxes, nrows, ncols, sharex, sharey): def _flatten(axes): - if not com.is_list_like(axes): + if not is_list_like(axes): return np.array([axes]) elif isinstance(axes, (np.ndarray, Index)): return axes.ravel() diff --git a/pandas/tools/rplot.py b/pandas/tools/rplot.py deleted file mode 100644 index 5a748b60aae9c..0000000000000 --- a/pandas/tools/rplot.py +++ /dev/null @@ -1,984 +0,0 @@ -import random -import warnings -from copy import deepcopy -from pandas.core.common import _values_from_object - -import numpy as np -from pandas.compat import range, zip -# -# TODO: -# * Make sure legends work properly -# - - -warnings.warn("\n" - "The rplot trellis plotting interface is deprecated and will be " - "removed in a future version. We refer to external packages " - "like seaborn for similar but more refined functionality. \n\n" - "See our docs http://pandas.pydata.org/pandas-docs/stable" - "/visualization.html#rplot " - "for some example how to convert your existing code to these " - "packages.", FutureWarning, stacklevel=2) - - -class Scale: - """ - Base class for mapping between graphical and data attributes. - """ - pass - - -class ScaleGradient(Scale): - """ - A mapping between a data attribute value and a - point in colour space between two specified colours. - """ - - def __init__(self, column, colour1, colour2): - """Initialize ScaleGradient instance. - - Parameters: - ----------- - column: string, pandas DataFrame column name - colour1: tuple - 3 element tuple with float values representing an RGB colour - colour2: tuple - 3 element tuple with float values representing an RGB colour - """ - self.column = column - self.colour1 = colour1 - self.colour2 = colour2 - self.categorical = False - - def __call__(self, data, index): - """Return a colour corresponding to data attribute value. - - Parameters: - ----------- - data: pandas DataFrame - index: pandas DataFrame row index - - Returns: - -------- - A three element tuple representing an RGB somewhere between colour1 and - colour2 - """ - x = data[self.column].iget(index) - a = min(data[self.column]) - b = max(data[self.column]) - r1, g1, b1 = self.colour1 - r2, g2, b2 = self.colour2 - x_scaled = (x - a) / (b - a) - return (r1 + (r2 - r1) * x_scaled, - g1 + (g2 - g1) * x_scaled, - b1 + (b2 - b1) * x_scaled) - - -class ScaleGradient2(Scale): - """ - Create a mapping between a data attribute value and a - point in colour space in a line of three specified colours. - """ - - def __init__(self, column, colour1, colour2, colour3): - """Initialize ScaleGradient2 instance. 
- - Parameters: - ----------- - column: string, pandas DataFrame column name - colour1: tuple - 3 element tuple with float values representing an RGB colour - colour2: tuple - 3 element tuple with float values representing an RGB colour - colour3: tuple - 3 element tuple with float values representing an RGB colour - """ - self.column = column - self.colour1 = colour1 - self.colour2 = colour2 - self.colour3 = colour3 - self.categorical = False - - def __call__(self, data, index): - """Return a colour corresponding to data attribute value. - - Parameters: - ----------- - data: pandas DataFrame - index: pandas DataFrame row index - - Returns: - -------- - A three element tuple representing an RGB somewhere along the line - of colour1, colour2 and colour3 - """ - x = data[self.column].iget(index) - a = min(data[self.column]) - b = max(data[self.column]) - r1, g1, b1 = self.colour1 - r2, g2, b2 = self.colour2 - r3, g3, b3 = self.colour3 - x_scaled = (x - a) / (b - a) - if x_scaled < 0.5: - x_scaled *= 2.0 - return (r1 + (r2 - r1) * x_scaled, - g1 + (g2 - g1) * x_scaled, - b1 + (b2 - b1) * x_scaled) - else: - x_scaled = (x_scaled - 0.5) * 2.0 - return (r2 + (r3 - r2) * x_scaled, - g2 + (g3 - g2) * x_scaled, - b2 + (b3 - b2) * x_scaled) - - -class ScaleSize(Scale): - """ - Provide a mapping between a DataFrame column and matplotlib - scatter plot shape size. - """ - - def __init__(self, column, min_size=5.0, max_size=100.0, - transform=lambda x: x): - """Initialize ScaleSize instance. - - Parameters: - ----------- - column: string, a column name - min_size: float, minimum point size - max_size: float, maximum point size - transform: function - a one argument function of form float -> float (e.g. lambda x: - log(x)) - """ - self.column = column - self.min_size = min_size - self.max_size = max_size - self.transform = transform - self.categorical = False - - def __call__(self, data, index): - """Return matplotlib scatter plot marker shape size. - - Parameters: - ----------- - data: pandas DataFrame - index: pandas DataFrame row index - """ - x = data[self.column].iget(index) - a = float(min(data[self.column])) - b = float(max(data[self.column])) - return self.transform(self.min_size + ((x - a) / (b - a)) * - (self.max_size - self.min_size)) - - -class ScaleShape(Scale): - """ - Provides a mapping between matplotlib marker shapes - and attribute values. - """ - - def __init__(self, column): - """Initialize ScaleShape instance. - - Parameters: - ----------- - column: string, pandas DataFrame column name - """ - self.column = column - self.shapes = ['o', '+', 's', '*', '^', '<', '>', 'v', '|', 'x'] - self.legends = set([]) - self.categorical = True - - def __call__(self, data, index): - """Returns a matplotlib marker identifier. - - Parameters: - ----------- - data: pandas DataFrame - index: pandas DataFrame row index - - Returns: - -------- - a matplotlib marker identifier - """ - values = sorted(list(set(data[self.column]))) - if len(values) > len(self.shapes): - raise ValueError("Too many different values of the categorical " - "attribute for ScaleShape") - x = data[self.column].iget(index) - return self.shapes[values.index(x)] - - -class ScaleRandomColour(Scale): - """ - Maps a random colour to a DataFrame attribute. - """ - - def __init__(self, column): - """Initialize ScaleRandomColour instance. 
- - Parameters: - ----------- - column: string, pandas DataFrame column name - """ - self.column = column - self.categorical = True - - def __call__(self, data, index): - """Return a tuple of three floats, representing - an RGB colour. - - Parameters: - ----------- - data: pandas DataFrame - index: pandas DataFrame row index - """ - random.seed(data[self.column].iget(index)) - return [random.random() for _ in range(3)] - - -class ScaleConstant(Scale): - """ - Constant returning scale. Usually used automatically. - """ - - def __init__(self, value): - """Initialize ScaleConstant instance. - - Parameters: - ----------- - value: any Python value to be returned when called - """ - self.value = value - self.categorical = False - - def __call__(self, data, index): - """Return the constant value. - - Parameters: - ----------- - data: pandas DataFrame - index: pandas DataFrame row index - - Returns: - -------- - A constant value specified during initialisation - """ - return self.value - - -def default_aes(x=None, y=None): - """Create the default aesthetics dictionary. - - Parameters: - ----------- - x: string, DataFrame column name - y: string, DataFrame column name - - Returns: - -------- - a dictionary with aesthetics bindings - """ - return { - 'x': x, - 'y': y, - 'size': ScaleConstant(40.0), - 'colour': ScaleConstant('grey'), - 'shape': ScaleConstant('o'), - 'alpha': ScaleConstant(1.0), - } - - -def make_aes(x=None, y=None, size=None, colour=None, shape=None, alpha=None): - """Create an empty aesthetics dictionary. - - Parameters: - ----------- - x: string, DataFrame column name - y: string, DataFrame column name - size: function, binding for size attribute of Geoms - colour: function, binding for colour attribute of Geoms - shape: function, binding for shape attribute of Geoms - alpha: function, binding for alpha attribute of Geoms - - Returns: - -------- - a dictionary with aesthetics bindings - """ - if not hasattr(size, '__call__') and size is not None: - size = ScaleConstant(size) - if not hasattr(colour, '__call__') and colour is not None: - colour = ScaleConstant(colour) - if not hasattr(shape, '__call__') and shape is not None: - shape = ScaleConstant(shape) - if not hasattr(alpha, '__call__') and alpha is not None: - alpha = ScaleConstant(alpha) - if any([isinstance(size, scale) - for scale in [ScaleConstant, ScaleSize]]) or size is None: - pass - else: - raise ValueError( - 'size mapping should be done through ScaleConstant or ScaleSize') - if (any([isinstance(colour, scale) - for scale in [ScaleConstant, ScaleGradient, - ScaleGradient2, ScaleRandomColour]]) or - colour is None): - pass - else: - raise ValueError('colour mapping should be done through ' - 'ScaleConstant, ScaleRandomColour, ScaleGradient ' - 'or ScaleGradient2') - if (any([isinstance(shape, scale) - for scale in [ScaleConstant, ScaleShape]]) or - shape is None): - pass - else: - raise ValueError('shape mapping should be done through ScaleConstant ' - 'or ScaleShape') - if (any([isinstance(alpha, scale) for scale in [ScaleConstant]]) or - alpha is None): - pass - else: - raise ValueError('alpha mapping should be done through ScaleConstant') - return { - 'x': x, - 'y': y, - 'size': size, - 'colour': colour, - 'shape': shape, - 'alpha': alpha, - } - - -class Layer: - """ - Layer object representing a single plot layer. - """ - - def __init__(self, data=None, **kwds): - """Initialize layer object. 
- - Parameters: - ----------- - data: pandas DataFrame instance - aes: aesthetics dictionary with bindings - """ - self.data = data - self.aes = make_aes(**kwds) - self.legend = {} - - def work(self, fig=None, ax=None): - """Do the drawing (usually) work. - - Parameters: - ----------- - fig: matplotlib figure - ax: matplotlib axis object - - Returns: - -------- - a tuple with the same figure and axis instances - """ - return fig, ax - - -class GeomPoint(Layer): - - def work(self, fig=None, ax=None): - """Render the layer on a matplotlib axis. - You can specify either a figure or an axis to draw on. - - Parameters: - ----------- - fig: matplotlib figure object - ax: matplotlib axis object to draw on - - Returns: - -------- - fig, ax: matplotlib figure and axis objects - """ - if ax is None: - if fig is None: - return fig, ax - else: - ax = fig.gca() - for index in range(len(self.data)): - row = self.data.iloc[index] - x = row[self.aes['x']] - y = row[self.aes['y']] - size_scaler = self.aes['size'] - colour_scaler = self.aes['colour'] - shape_scaler = self.aes['shape'] - alpha = self.aes['alpha'] - size_value = size_scaler(self.data, index) - colour_value = colour_scaler(self.data, index) - marker_value = shape_scaler(self.data, index) - alpha_value = alpha(self.data, index) - patch = ax.scatter(x, y, - s=size_value, - c=colour_value, - marker=marker_value, - alpha=alpha_value) - label = [] - if colour_scaler.categorical: - label += [colour_scaler.column, row[colour_scaler.column]] - if shape_scaler.categorical: - label += [shape_scaler.column, row[shape_scaler.column]] - self.legend[tuple(label)] = patch - ax.set_xlabel(self.aes['x']) - ax.set_ylabel(self.aes['y']) - return fig, ax - - -class GeomPolyFit(Layer): - """ - Draw a polynomial fit of specified degree. - """ - - def __init__(self, degree, lw=2.0, colour='grey'): - """Initialize GeomPolyFit object. - - Parameters: - ----------- - degree: an integer, polynomial degree - lw: line width - colour: matplotlib colour - """ - self.degree = degree - self.lw = lw - self.colour = colour - Layer.__init__(self) - - def work(self, fig=None, ax=None): - """Draw the polynomial fit on matplotlib figure or axis - - Parameters: - ----------- - fig: matplotlib figure - ax: matplotlib axis - - Returns: - -------- - a tuple with figure and axis objects - """ - if ax is None: - if fig is None: - return fig, ax - else: - ax = fig.gca() - from numpy.polynomial.polynomial import polyfit - from numpy.polynomial.polynomial import polyval - x = self.data[self.aes['x']] - y = self.data[self.aes['y']] - min_x = min(x) - max_x = max(x) - c = polyfit(x, y, self.degree) - x_ = np.linspace(min_x, max_x, len(x)) - y_ = polyval(x_, c) - ax.plot(x_, y_, lw=self.lw, c=self.colour) - return fig, ax - - -class GeomScatter(Layer): - """ - An efficient scatter plot, use this instead of GeomPoint for speed. - """ - - def __init__(self, marker='o', colour='lightblue', alpha=1.0): - """Initialize GeomScatter instance. 
- - Parameters: - ----------- - marker: matplotlib marker string - colour: matplotlib colour - alpha: matplotlib alpha - """ - self.marker = marker - self.colour = colour - self.alpha = alpha - Layer.__init__(self) - - def work(self, fig=None, ax=None): - """Draw a scatter plot on matplotlib figure or axis - - Parameters: - ----------- - fig: matplotlib figure - ax: matplotlib axis - - Returns: - -------- - a tuple with figure and axis objects - """ - if ax is None: - if fig is None: - return fig, ax - else: - ax = fig.gca() - x = self.data[self.aes['x']] - y = self.data[self.aes['y']] - ax.scatter(x, y, marker=self.marker, c=self.colour, alpha=self.alpha) - return fig, ax - - -class GeomHistogram(Layer): - """ - An efficient histogram, use this instead of GeomBar for speed. - """ - - def __init__(self, bins=10, colour='lightblue'): - """Initialize GeomHistogram instance. - - Parameters: - ----------- - bins: integer, number of histogram bins - colour: matplotlib colour - """ - self.bins = bins - self.colour = colour - Layer.__init__(self) - - def work(self, fig=None, ax=None): - """Draw a histogram on matplotlib figure or axis - - Parameters: - ----------- - fig: matplotlib figure - ax: matplotlib axis - - Returns: - -------- - a tuple with figure and axis objects - """ - if ax is None: - if fig is None: - return fig, ax - else: - ax = fig.gca() - x = self.data[self.aes['x']] - ax.hist(_values_from_object(x), self.bins, facecolor=self.colour) - ax.set_xlabel(self.aes['x']) - return fig, ax - - -class GeomDensity(Layer): - """ - A kernel density estimation plot. - """ - - def work(self, fig=None, ax=None): - """Draw a one dimensional kernel density plot. - You can specify either a figure or an axis to draw on. - - Parameters: - ----------- - fig: matplotlib figure object - ax: matplotlib axis object to draw on - - Returns: - -------- - fig, ax: matplotlib figure and axis objects - """ - if ax is None: - if fig is None: - return fig, ax - else: - ax = fig.gca() - from scipy.stats import gaussian_kde - x = self.data[self.aes['x']] - gkde = gaussian_kde(x) - ind = np.linspace(x.min(), x.max(), 200) - ax.plot(ind, gkde.evaluate(ind)) - return fig, ax - - -class GeomDensity2D(Layer): - - def work(self, fig=None, ax=None): - """Draw a two dimensional kernel density plot. - You can specify either a figure or an axis to draw on. - - Parameters: - ----------- - fig: matplotlib figure object - ax: matplotlib axis object to draw on - - Returns: - -------- - fig, ax: matplotlib figure and axis objects - """ - if ax is None: - if fig is None: - return fig, ax - else: - ax = fig.gca() - x = self.data[self.aes['x']] - y = self.data[self.aes['y']] - - # TODO: unused? - # rvs = np.array([x, y]) - - x_min = x.min() - x_max = x.max() - y_min = y.min() - y_max = y.max() - X, Y = np.mgrid[x_min:x_max:200j, y_min:y_max:200j] - positions = np.vstack([X.ravel(), Y.ravel()]) - values = np.vstack([x, y]) - import scipy.stats as stats - kernel = stats.gaussian_kde(values) - Z = np.reshape(kernel(positions).T, X.shape) - ax.contour(Z, extent=[x_min, x_max, y_min, y_max]) - return fig, ax - - -class TrellisGrid(Layer): - - def __init__(self, by): - """Initialize TreelisGrid instance. - - Parameters: - ----------- - by: column names to group by - """ - if len(by) != 2: - raise ValueError("You must give a list of length 2 to group by") - elif by[0] == '.' 
and by[1] == '.': - raise ValueError( - "At least one of grouping attributes must be not a dot") - self.by = by - - def trellis(self, layers): - """ - Create a trellis structure for a list of layers. Each layer will be - cloned with different data in to a two dimensional grid. - - Parameters: - ----------- - layers: a list of Layer objects - - Returns: - -------- - trellised_layers: Clones of each layer in the list arranged in a - trellised latice - """ - trellised_layers = [] - for layer in layers: - data = layer.data - if self.by[0] == '.': - grouped = data.groupby(self.by[1]) - elif self.by[1] == '.': - grouped = data.groupby(self.by[0]) - else: - grouped = data.groupby(self.by) - groups = list(grouped.groups.keys()) - if self.by[0] == '.' or self.by[1] == '.': - shingle1 = set([g for g in groups]) - else: - shingle1 = set([g[0] for g in groups]) - shingle2 = set([g[1] for g in groups]) - if self.by[0] == '.': - self.rows = 1 - self.cols = len(shingle1) - elif self.by[1] == '.': - self.rows = len(shingle1) - self.cols = 1 - else: - self.rows = len(shingle1) - self.cols = len(shingle2) - trellised = [[None for _ in range(self.cols)] - for _ in range(self.rows)] - self.group_grid = [[None for _ in range( - self.cols)] for _ in range(self.rows)] - row = 0 - col = 0 - for group, data in grouped: - new_layer = deepcopy(layer) - new_layer.data = data - trellised[row][col] = new_layer - self.group_grid[row][col] = group - col += 1 - if col >= self.cols: - col = 0 - row += 1 - trellised_layers.append(trellised) - return trellised_layers - - -def dictionary_union(dict1, dict2): - """Take two dictionaries, return dictionary union. - - Parameters: - ----------- - dict1: Python dictionary - dict2: Python dictionary - - Returns: - -------- - A union of the dictionaries. It assumes that values - with the same keys are identical. - """ - keys1 = list(dict1.keys()) - keys2 = list(dict2.keys()) - result = {} - for key1 in keys1: - result[key1] = dict1[key1] - for key2 in keys2: - result[key2] = dict2[key2] - return result - - -def merge_aes(layer1, layer2): - """Merges the aesthetics dictionaries for the two layers. - Look up sequence_layers function. Which layer is first and which - one is second is important. - - Parameters: - ----------- - layer1: Layer object - layer2: Layer object - """ - for key in layer2.aes.keys(): - if layer2.aes[key] is None: - layer2.aes[key] = layer1.aes[key] - - -def sequence_layers(layers): - """ - Go through the list of layers and fill in the missing bits of information. - The basic rules are this: - - * If the current layer has data set to None, take the data from previous - layer. - * For each aesthetic mapping, if that mapping is set to None, take it from - previous layer. - - Parameters: - ----------- - layers: a list of Layer objects - """ - for layer1, layer2 in zip(layers[:-1], layers[1:]): - if layer2.data is None: - layer2.data = layer1.data - merge_aes(layer1, layer2) - return layers - - -def sequence_grids(layer_grids): - """ - Go through the list of layer girds and perform the same thing as - sequence_layers. 
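The layer-inheritance rule documented in the hunk above (a layer whose data or aesthetic mapping is None picks it up from the layer before it) can be sketched in isolation. The miniature Layer below is hypothetical and only mirrors that documented rule; it is not the pandas rplot implementation this diff removes.

class Layer(object):
    """Hypothetical miniature of an rplot layer: just data plus aesthetics."""
    def __init__(self, data=None, **aes):
        self.data = data
        self.aes = aes

def sequence_layers(layers):
    # documented rule: missing data / aesthetics come from the previous layer
    for prev, cur in zip(layers[:-1], layers[1:]):
        if cur.data is None:
            cur.data = prev.data
        for key, value in prev.aes.items():
            if cur.aes.get(key) is None:
                cur.aes[key] = value
    return layers

base = Layer(data={'x': [1, 2, 3], 'y': [4, 6, 5]}, x='x', y='y')
geom = Layer()                              # no data, no aesthetics of its own
sequence_layers([base, geom])
assert geom.data is base.data               # inherited from the previous layer
assert geom.aes == {'x': 'x', 'y': 'y'}     # aesthetics filled in the same way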
- - Parameters: - ----------- - layer_grids: a list of two dimensional layer grids - """ - for grid1, grid2 in zip(layer_grids[:-1], layer_grids[1:]): - for row1, row2 in zip(grid1, grid2): - for layer1, layer2 in zip(row1, row2): - if layer2.data is None: - layer2.data = layer1.data - merge_aes(layer1, layer2) - return layer_grids - - -def work_grid(grid, fig): - """ - Take a two dimensional grid, add subplots to a figure for each cell and do - layer work. - - Parameters: - ----------- - grid: a two dimensional grid of layers - fig: matplotlib figure to draw on - - Returns: - -------- - axes: a two dimensional list of matplotlib axes - """ - nrows = len(grid) - ncols = len(grid[0]) - axes = [[None for _ in range(ncols)] for _ in range(nrows)] - for row in range(nrows): - for col in range(ncols): - axes[row][col] = fig.add_subplot( - nrows, ncols, ncols * row + col + 1) - grid[row][col].work(ax=axes[row][col]) - return axes - - -def adjust_subplots(fig, axes, trellis, layers): - """Adjust the subtplots on matplotlib figure with the - fact that we have a trellis plot in mind. - - Parameters: - ----------- - fig: matplotlib figure - axes: a two dimensional grid of matplotlib axes - trellis: TrellisGrid object - layers: last grid of layers in the plot - """ - # Flatten the axes grid - axes = [ax for row in axes for ax in row] - min_x = min([ax.get_xlim()[0] for ax in axes]) - max_x = max([ax.get_xlim()[1] for ax in axes]) - min_y = min([ax.get_ylim()[0] for ax in axes]) - max_y = max([ax.get_ylim()[1] for ax in axes]) - [ax.set_xlim(min_x, max_x) for ax in axes] - [ax.set_ylim(min_y, max_y) for ax in axes] - for index, axis in enumerate(axes): - if index % trellis.cols == 0: - pass - else: - axis.get_yaxis().set_ticks([]) - axis.set_ylabel('') - if index / trellis.cols == trellis.rows - 1: - pass - else: - axis.get_xaxis().set_ticks([]) - axis.set_xlabel('') - if trellis.by[0] == '.': - label1 = "%s = %s" % (trellis.by[1], trellis.group_grid[ - index // trellis.cols][index % trellis.cols]) - label2 = None - elif trellis.by[1] == '.': - label1 = "%s = %s" % (trellis.by[0], trellis.group_grid[ - index // trellis.cols][index % trellis.cols]) - label2 = None - else: - label1 = "%s = %s" % ( - trellis.by[0], - trellis.group_grid[index // trellis.cols] - [index % trellis.cols][0]) - label2 = "%s = %s" % ( - trellis.by[1], - trellis.group_grid[index // trellis.cols] - [index % trellis.cols][1]) - if label2 is not None: - axis.table(cellText=[[label1], [label2]], - loc='top', cellLoc='center', - cellColours=[['lightgrey'], ['lightgrey']]) - else: - axis.table(cellText=[[label1]], loc='top', - cellLoc='center', cellColours=[['lightgrey']]) - # Flatten the layer grid - layers = [layer for row in layers for layer in row] - legend = {} - for layer in layers: - legend = dictionary_union(legend, layer.legend) - patches = [] - labels = [] - if len(list(legend.keys())) == 0: - key_function = lambda tup: tup - elif len(list(legend.keys())[0]) == 2: - key_function = lambda tup: (tup[1]) - else: - key_function = lambda tup: (tup[1], tup[3]) - for key in sorted(list(legend.keys()), key=key_function): - value = legend[key] - patches.append(value) - if len(key) == 2: - col, val = key - labels.append("%s" % str(val)) - elif len(key) == 4: - col1, val1, col2, val2 = key - labels.append("%s, %s" % (str(val1), str(val2))) - else: - raise ValueError( - "Maximum 2 categorical attributes to display a lengend of") - if len(legend): - fig.legend(patches, labels, loc='upper right') - fig.subplots_adjust(wspace=0.05, 
hspace=0.2) - - -class RPlot: - """ - The main plot object. Add layers to an instance of this object to create a - plot. - """ - - def __init__(self, data, x=None, y=None): - """Initialize RPlot instance. - - Parameters: - ----------- - data: pandas DataFrame instance - x: string, DataFrame column name - y: string, DataFrame column name - """ - self.layers = [Layer(data, **default_aes(x=x, y=y))] - - def add(self, layer): - """Add a layer to RPlot instance. - - Parameters: - ----------- - layer: Layer instance - """ - if not isinstance(layer, Layer): - raise TypeError( - "The operand on the right side of + must be a Layer instance") - self.layers.append(layer) - - def render(self, fig=None): - """Render all the layers on a matplotlib figure. - - Parameters: - ----------- - fig: matplotlib figure - """ - import matplotlib.pyplot as plt - if fig is None: - fig = plt.gcf() - # Look for the last TrellisGrid instance in the layer list - last_trellis = None - for layer in self.layers: - if isinstance(layer, TrellisGrid): - last_trellis = layer - if last_trellis is None: - # We have a simple, non-trellised plot - new_layers = sequence_layers(self.layers) - for layer in new_layers: - layer.work(fig=fig) - legend = {} - for layer in new_layers: - legend = dictionary_union(legend, layer.legend) - patches = [] - labels = [] - if len(list(legend.keys())) == 0: - key_function = lambda tup: tup - elif len(list(legend.keys())[0]) == 2: - key_function = lambda tup: (tup[1]) - else: - key_function = lambda tup: (tup[1], tup[3]) - for key in sorted(list(legend.keys()), key=key_function): - value = legend[key] - patches.append(value) - if len(key) == 2: - col, val = key - labels.append("%s" % str(val)) - elif len(key) == 4: - col1, val1, col2, val2 = key - labels.append("%s, %s" % (str(val1), str(val2))) - else: - raise ValueError("Maximum 2 categorical attributes to " - "display a lengend of") - if len(legend): - fig.legend(patches, labels, loc='upper right') - else: - # We have a trellised plot. First let's remove all other - # TrellisGrid instances from the layer list, including this one. 
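For reference, the interface deleted in this hunk was driven roughly as follows: an RPlot wraps a DataFrame, geoms are stacked with add(), and render() draws every layer onto a matplotlib figure. The sketch assumes the module path pandas.tools.rplot and only runs against a pandas release that still ships it; it documents the removed API rather than anything added by this diff.

import matplotlib.pyplot as plt
import pandas as pd
import pandas.tools.rplot as rplot   # assumed module path; removed by this diff

df = pd.DataFrame({'height': [150., 160., 170., 180.],
                   'weight': [50., 60., 70., 80.]})

plot = rplot.RPlot(df, x='height', y='weight')   # base layer, default aesthetics
plot.add(rplot.GeomScatter(colour='lightblue'))  # fast scatter geom
fig = plot.render(plt.gcf())                     # draw every layer on the figure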
- new_layers = [] - for layer in self.layers: - if not isinstance(layer, TrellisGrid): - new_layers.append(layer) - new_layers = sequence_layers(new_layers) - # Now replace the old layers by their trellised versions - new_layers = last_trellis.trellis(new_layers) - # Prepare the subplots and draw on them - new_layers = sequence_grids(new_layers) - axes_grids = [work_grid(grid, fig) for grid in new_layers] - axes_grid = axes_grids[-1] - adjust_subplots(fig, axes_grid, last_trellis, new_layers[-1]) - # And we're done - return fig diff --git a/pandas/tools/tests/data/allow_exact_matches.csv b/pandas/tools/tests/data/allow_exact_matches.csv new file mode 100644 index 0000000000000..0446fb744c540 --- /dev/null +++ b/pandas/tools/tests/data/allow_exact_matches.csv @@ -0,0 +1,28 @@ +time,ticker,price,quantity,marketCenter,bid,ask +20160525 13:30:00.023,MSFT,51.95,75,NASDAQ,, +20160525 13:30:00.038,MSFT,51.95,155,NASDAQ,51.95,51.95 +20160525 13:30:00.048,GOOG,720.77,100,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.92,100,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.93,200,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.93,300,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.93,600,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.93,44,NASDAQ,720.5,720.93 +20160525 13:30:00.074,AAPL,98.67,478343,NASDAQ,, +20160525 13:30:00.075,AAPL,98.67,478343,NASDAQ,, +20160525 13:30:00.075,AAPL,98.66,6,NASDAQ,, +20160525 13:30:00.075,AAPL,98.65,30,NASDAQ,, +20160525 13:30:00.075,AAPL,98.65,75,NASDAQ,, +20160525 13:30:00.075,AAPL,98.65,20,NASDAQ,, +20160525 13:30:00.075,AAPL,98.65,35,NASDAQ,, +20160525 13:30:00.075,AAPL,98.65,10,NASDAQ,, +20160525 13:30:00.075,AAPL,98.55,6,ARCA,, +20160525 13:30:00.075,AAPL,98.55,6,ARCA,, +20160525 13:30:00.076,AAPL,98.56,1000,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,300,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,400,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,600,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56 +20160525 13:30:00.078,MSFT,51.95,783,NASDAQ,51.95,51.95 +20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.95,51.95 +20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.95,51.95 diff --git a/pandas/tools/tests/data/allow_exact_matches_and_tolerance.csv b/pandas/tools/tests/data/allow_exact_matches_and_tolerance.csv new file mode 100644 index 0000000000000..0446fb744c540 --- /dev/null +++ b/pandas/tools/tests/data/allow_exact_matches_and_tolerance.csv @@ -0,0 +1,28 @@ +time,ticker,price,quantity,marketCenter,bid,ask +20160525 13:30:00.023,MSFT,51.95,75,NASDAQ,, +20160525 13:30:00.038,MSFT,51.95,155,NASDAQ,51.95,51.95 +20160525 13:30:00.048,GOOG,720.77,100,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.92,100,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.93,200,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.93,300,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.93,600,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.93,44,NASDAQ,720.5,720.93 +20160525 13:30:00.074,AAPL,98.67,478343,NASDAQ,, +20160525 13:30:00.075,AAPL,98.67,478343,NASDAQ,, +20160525 13:30:00.075,AAPL,98.66,6,NASDAQ,, +20160525 13:30:00.075,AAPL,98.65,30,NASDAQ,, +20160525 13:30:00.075,AAPL,98.65,75,NASDAQ,, +20160525 13:30:00.075,AAPL,98.65,20,NASDAQ,, +20160525 13:30:00.075,AAPL,98.65,35,NASDAQ,, +20160525 13:30:00.075,AAPL,98.65,10,NASDAQ,, +20160525 13:30:00.075,AAPL,98.55,6,ARCA,, +20160525 13:30:00.075,AAPL,98.55,6,ARCA,, +20160525 
13:30:00.076,AAPL,98.56,1000,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,300,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,400,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,600,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56 +20160525 13:30:00.078,MSFT,51.95,783,NASDAQ,51.95,51.95 +20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.95,51.95 +20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.95,51.95 diff --git a/pandas/tools/tests/data/asof.csv b/pandas/tools/tests/data/asof.csv new file mode 100644 index 0000000000000..d7d061bc46ccc --- /dev/null +++ b/pandas/tools/tests/data/asof.csv @@ -0,0 +1,28 @@ +time,ticker,price,quantity,marketCenter,bid,ask +20160525 13:30:00.023,MSFT,51.95,75,NASDAQ,51.95,51.95 +20160525 13:30:00.038,MSFT,51.95,155,NASDAQ,51.95,51.95 +20160525 13:30:00.048,GOOG,720.77,100,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.92,100,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.93,200,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.93,300,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.93,600,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.93,44,NASDAQ,720.5,720.93 +20160525 13:30:00.074,AAPL,98.67,478343,NASDAQ,, +20160525 13:30:00.075,AAPL,98.67,478343,NASDAQ,98.55,98.56 +20160525 13:30:00.075,AAPL,98.66,6,NASDAQ,98.55,98.56 +20160525 13:30:00.075,AAPL,98.65,30,NASDAQ,98.55,98.56 +20160525 13:30:00.075,AAPL,98.65,75,NASDAQ,98.55,98.56 +20160525 13:30:00.075,AAPL,98.65,20,NASDAQ,98.55,98.56 +20160525 13:30:00.075,AAPL,98.65,35,NASDAQ,98.55,98.56 +20160525 13:30:00.075,AAPL,98.65,10,NASDAQ,98.55,98.56 +20160525 13:30:00.075,AAPL,98.55,6,ARCA,98.55,98.56 +20160525 13:30:00.075,AAPL,98.55,6,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,1000,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,300,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,400,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,600,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56 +20160525 13:30:00.078,MSFT,51.95,783,NASDAQ,51.92,51.95 +20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.92,51.95 +20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.92,51.95 diff --git a/pandas/tools/tests/data/asof2.csv b/pandas/tools/tests/data/asof2.csv new file mode 100644 index 0000000000000..2c9c0392dd617 --- /dev/null +++ b/pandas/tools/tests/data/asof2.csv @@ -0,0 +1,78 @@ +time,ticker,price,quantity,marketCenter,bid,ask +20160525 13:30:00.023,MSFT,51.95,75,NASDAQ,51.95,51.95 +20160525 13:30:00.038,MSFT,51.95,155,NASDAQ,51.95,51.95 +20160525 13:30:00.048,GOOG,720.77,100,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.92,100,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.93,200,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.93,300,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.93,600,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.93,44,NASDAQ,720.5,720.93 +20160525 13:30:00.074,AAPL,98.67,478343,NASDAQ,, +20160525 13:30:00.075,AAPL,98.67,478343,NASDAQ,98.55,98.56 +20160525 13:30:00.075,AAPL,98.66,6,NASDAQ,98.55,98.56 +20160525 13:30:00.075,AAPL,98.65,30,NASDAQ,98.55,98.56 +20160525 13:30:00.075,AAPL,98.65,75,NASDAQ,98.55,98.56 +20160525 13:30:00.075,AAPL,98.65,20,NASDAQ,98.55,98.56 +20160525 13:30:00.075,AAPL,98.65,35,NASDAQ,98.55,98.56 +20160525 13:30:00.075,AAPL,98.65,10,NASDAQ,98.55,98.56 +20160525 13:30:00.075,AAPL,98.55,6,ARCA,98.55,98.56 +20160525 13:30:00.075,AAPL,98.55,6,ARCA,98.55,98.56 +20160525 
13:30:00.076,AAPL,98.56,1000,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,300,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,400,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,600,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56 +20160525 13:30:00.078,MSFT,51.95,783,NASDAQ,51.92,51.95 +20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.92,51.95 +20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.92,51.95 +20160525 13:30:00.084,AAPL,98.64,40,NASDAQ,98.55,98.56 +20160525 13:30:00.084,AAPL,98.55,149,EDGX,98.55,98.56 +20160525 13:30:00.086,AAPL,98.56,500,ARCA,98.55,98.63 +20160525 13:30:00.104,AAPL,98.63,647,EDGX,98.62,98.63 +20160525 13:30:00.104,AAPL,98.63,300,EDGX,98.62,98.63 +20160525 13:30:00.104,AAPL,98.63,50,NASDAQ,98.62,98.63 +20160525 13:30:00.104,AAPL,98.63,50,NASDAQ,98.62,98.63 +20160525 13:30:00.104,AAPL,98.63,70,NASDAQ,98.62,98.63 +20160525 13:30:00.104,AAPL,98.63,70,NASDAQ,98.62,98.63 +20160525 13:30:00.104,AAPL,98.63,1,NASDAQ,98.62,98.63 +20160525 13:30:00.104,AAPL,98.63,62,NASDAQ,98.62,98.63 +20160525 13:30:00.104,AAPL,98.63,10,NASDAQ,98.62,98.63 +20160525 13:30:00.104,AAPL,98.63,100,ARCA,98.62,98.63 +20160525 13:30:00.105,AAPL,98.63,100,ARCA,98.62,98.63 +20160525 13:30:00.105,AAPL,98.63,700,ARCA,98.62,98.63 +20160525 13:30:00.106,AAPL,98.63,61,EDGX,98.62,98.63 +20160525 13:30:00.107,AAPL,98.63,100,ARCA,98.62,98.63 +20160525 13:30:00.107,AAPL,98.63,53,ARCA,98.62,98.63 +20160525 13:30:00.108,AAPL,98.63,100,ARCA,98.62,98.63 +20160525 13:30:00.108,AAPL,98.63,839,ARCA,98.62,98.63 +20160525 13:30:00.115,AAPL,98.63,5,EDGX,98.62,98.63 +20160525 13:30:00.118,AAPL,98.63,295,EDGX,98.62,98.63 +20160525 13:30:00.118,AAPL,98.63,5,EDGX,98.62,98.63 +20160525 13:30:00.128,AAPL,98.63,100,NASDAQ,98.62,98.63 +20160525 13:30:00.128,AAPL,98.63,100,NASDAQ,98.62,98.63 +20160525 13:30:00.128,MSFT,51.92,100,ARCA,51.92,51.95 +20160525 13:30:00.129,AAPL,98.62,100,NASDAQ,98.61,98.63 +20160525 13:30:00.129,AAPL,98.62,10,NASDAQ,98.61,98.63 +20160525 13:30:00.129,AAPL,98.62,59,NASDAQ,98.61,98.63 +20160525 13:30:00.129,AAPL,98.62,31,NASDAQ,98.61,98.63 +20160525 13:30:00.129,AAPL,98.62,69,NASDAQ,98.61,98.63 +20160525 13:30:00.129,AAPL,98.62,12,NASDAQ,98.61,98.63 +20160525 13:30:00.129,AAPL,98.62,12,EDGX,98.61,98.63 +20160525 13:30:00.129,AAPL,98.62,100,ARCA,98.61,98.63 +20160525 13:30:00.129,AAPL,98.62,100,ARCA,98.61,98.63 +20160525 13:30:00.130,MSFT,51.95,317,ARCA,51.93,51.95 +20160525 13:30:00.130,MSFT,51.95,283,ARCA,51.93,51.95 +20160525 13:30:00.135,MSFT,51.93,100,EDGX,51.92,51.95 +20160525 13:30:00.135,AAPL,98.62,100,ARCA,98.61,98.62 +20160525 13:30:00.144,AAPL,98.62,12,NASDAQ,98.61,98.62 +20160525 13:30:00.144,AAPL,98.62,88,NASDAQ,98.61,98.62 +20160525 13:30:00.144,AAPL,98.62,162,NASDAQ,98.61,98.62 +20160525 13:30:00.144,AAPL,98.61,100,BATS,98.61,98.62 +20160525 13:30:00.144,AAPL,98.62,61,ARCA,98.61,98.62 +20160525 13:30:00.144,AAPL,98.62,25,ARCA,98.61,98.62 +20160525 13:30:00.144,AAPL,98.62,14,ARCA,98.61,98.62 +20160525 13:30:00.145,AAPL,98.62,12,ARCA,98.6,98.63 +20160525 13:30:00.145,AAPL,98.62,100,ARCA,98.6,98.63 +20160525 13:30:00.145,AAPL,98.63,100,NASDAQ,98.6,98.63 +20160525 13:30:00.145,AAPL,98.63,100,NASDAQ,98.6,98.63 diff --git a/pandas/tools/tests/cut_data.csv b/pandas/tools/tests/data/cut_data.csv similarity index 100% rename from pandas/tools/tests/cut_data.csv rename to pandas/tools/tests/data/cut_data.csv diff --git a/pandas/tools/tests/data/quotes.csv b/pandas/tools/tests/data/quotes.csv new file mode 
100644 index 0000000000000..3f31d2cfffe1b --- /dev/null +++ b/pandas/tools/tests/data/quotes.csv @@ -0,0 +1,17 @@ +time,ticker,bid,ask +20160525 13:30:00.023,GOOG,720.50,720.93 +20160525 13:30:00.023,MSFT,51.95,51.95 +20160525 13:30:00.041,MSFT,51.95,51.95 +20160525 13:30:00.048,GOOG,720.50,720.93 +20160525 13:30:00.048,GOOG,720.50,720.93 +20160525 13:30:00.048,GOOG,720.50,720.93 +20160525 13:30:00.048,GOOG,720.50,720.93 +20160525 13:30:00.072,GOOG,720.50,720.88 +20160525 13:30:00.075,AAPL,98.55,98.56 +20160525 13:30:00.076,AAPL,98.55,98.56 +20160525 13:30:00.076,AAPL,98.55,98.56 +20160525 13:30:00.076,AAPL,98.55,98.56 +20160525 13:30:00.078,MSFT,51.95,51.95 +20160525 13:30:00.078,MSFT,51.95,51.95 +20160525 13:30:00.078,MSFT,51.95,51.95 +20160525 13:30:00.078,MSFT,51.92,51.95 diff --git a/pandas/tools/tests/data/quotes2.csv b/pandas/tools/tests/data/quotes2.csv new file mode 100644 index 0000000000000..7ade1e7faf1ae --- /dev/null +++ b/pandas/tools/tests/data/quotes2.csv @@ -0,0 +1,57 @@ +time,ticker,bid,ask +20160525 13:30:00.023,GOOG,720.50,720.93 +20160525 13:30:00.023,MSFT,51.95,51.95 +20160525 13:30:00.041,MSFT,51.95,51.95 +20160525 13:30:00.048,GOOG,720.50,720.93 +20160525 13:30:00.048,GOOG,720.50,720.93 +20160525 13:30:00.048,GOOG,720.50,720.93 +20160525 13:30:00.048,GOOG,720.50,720.93 +20160525 13:30:00.072,GOOG,720.50,720.88 +20160525 13:30:00.075,AAPL,98.55,98.56 +20160525 13:30:00.076,AAPL,98.55,98.56 +20160525 13:30:00.076,AAPL,98.55,98.56 +20160525 13:30:00.076,AAPL,98.55,98.56 +20160525 13:30:00.078,MSFT,51.95,51.95 +20160525 13:30:00.078,MSFT,51.95,51.95 +20160525 13:30:00.078,MSFT,51.95,51.95 +20160525 13:30:00.078,MSFT,51.92,51.95 +20160525 13:30:00.079,MSFT,51.92,51.95 +20160525 13:30:00.080,AAPL,98.55,98.56 +20160525 13:30:00.084,AAPL,98.55,98.56 +20160525 13:30:00.086,AAPL,98.55,98.63 +20160525 13:30:00.088,AAPL,98.65,98.63 +20160525 13:30:00.089,AAPL,98.63,98.63 +20160525 13:30:00.104,AAPL,98.63,98.63 +20160525 13:30:00.104,AAPL,98.63,98.63 +20160525 13:30:00.104,AAPL,98.63,98.63 +20160525 13:30:00.104,AAPL,98.63,98.63 +20160525 13:30:00.104,AAPL,98.62,98.63 +20160525 13:30:00.105,AAPL,98.62,98.63 +20160525 13:30:00.107,AAPL,98.62,98.63 +20160525 13:30:00.115,AAPL,98.62,98.63 +20160525 13:30:00.115,AAPL,98.62,98.63 +20160525 13:30:00.118,AAPL,98.62,98.63 +20160525 13:30:00.128,AAPL,98.62,98.63 +20160525 13:30:00.128,AAPL,98.62,98.63 +20160525 13:30:00.129,AAPL,98.62,98.63 +20160525 13:30:00.129,AAPL,98.61,98.63 +20160525 13:30:00.129,AAPL,98.62,98.63 +20160525 13:30:00.129,AAPL,98.62,98.63 +20160525 13:30:00.129,AAPL,98.61,98.63 +20160525 13:30:00.130,MSFT,51.93,51.95 +20160525 13:30:00.130,MSFT,51.93,51.95 +20160525 13:30:00.130,AAPL,98.61,98.63 +20160525 13:30:00.131,AAPL,98.61,98.62 +20160525 13:30:00.131,AAPL,98.61,98.62 +20160525 13:30:00.135,MSFT,51.92,51.95 +20160525 13:30:00.135,AAPL,98.61,98.62 +20160525 13:30:00.136,AAPL,98.61,98.62 +20160525 13:30:00.136,AAPL,98.61,98.62 +20160525 13:30:00.144,AAPL,98.61,98.62 +20160525 13:30:00.144,AAPL,98.61,98.62 +20160525 13:30:00.145,AAPL,98.61,98.62 +20160525 13:30:00.145,AAPL,98.61,98.63 +20160525 13:30:00.145,AAPL,98.61,98.63 +20160525 13:30:00.145,AAPL,98.60,98.63 +20160525 13:30:00.145,AAPL,98.61,98.63 +20160525 13:30:00.145,AAPL,98.60,98.63 diff --git a/pandas/tools/tests/data/tolerance.csv b/pandas/tools/tests/data/tolerance.csv new file mode 100644 index 0000000000000..d7d061bc46ccc --- /dev/null +++ b/pandas/tools/tests/data/tolerance.csv @@ -0,0 +1,28 @@ +time,ticker,price,quantity,marketCenter,bid,ask 
+20160525 13:30:00.023,MSFT,51.95,75,NASDAQ,51.95,51.95 +20160525 13:30:00.038,MSFT,51.95,155,NASDAQ,51.95,51.95 +20160525 13:30:00.048,GOOG,720.77,100,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.92,100,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.93,200,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.93,300,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.93,600,NASDAQ,720.5,720.93 +20160525 13:30:00.048,GOOG,720.93,44,NASDAQ,720.5,720.93 +20160525 13:30:00.074,AAPL,98.67,478343,NASDAQ,, +20160525 13:30:00.075,AAPL,98.67,478343,NASDAQ,98.55,98.56 +20160525 13:30:00.075,AAPL,98.66,6,NASDAQ,98.55,98.56 +20160525 13:30:00.075,AAPL,98.65,30,NASDAQ,98.55,98.56 +20160525 13:30:00.075,AAPL,98.65,75,NASDAQ,98.55,98.56 +20160525 13:30:00.075,AAPL,98.65,20,NASDAQ,98.55,98.56 +20160525 13:30:00.075,AAPL,98.65,35,NASDAQ,98.55,98.56 +20160525 13:30:00.075,AAPL,98.65,10,NASDAQ,98.55,98.56 +20160525 13:30:00.075,AAPL,98.55,6,ARCA,98.55,98.56 +20160525 13:30:00.075,AAPL,98.55,6,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,1000,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,300,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,400,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,600,ARCA,98.55,98.56 +20160525 13:30:00.076,AAPL,98.56,200,ARCA,98.55,98.56 +20160525 13:30:00.078,MSFT,51.95,783,NASDAQ,51.92,51.95 +20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.92,51.95 +20160525 13:30:00.078,MSFT,51.95,100,NASDAQ,51.92,51.95 diff --git a/pandas/tools/tests/data/trades.csv b/pandas/tools/tests/data/trades.csv new file mode 100644 index 0000000000000..b26a4ce714255 --- /dev/null +++ b/pandas/tools/tests/data/trades.csv @@ -0,0 +1,28 @@ +time,ticker,price,quantity,marketCenter +20160525 13:30:00.023,MSFT,51.9500,75,NASDAQ +20160525 13:30:00.038,MSFT,51.9500,155,NASDAQ +20160525 13:30:00.048,GOOG,720.7700,100,NASDAQ +20160525 13:30:00.048,GOOG,720.9200,100,NASDAQ +20160525 13:30:00.048,GOOG,720.9300,200,NASDAQ +20160525 13:30:00.048,GOOG,720.9300,300,NASDAQ +20160525 13:30:00.048,GOOG,720.9300,600,NASDAQ +20160525 13:30:00.048,GOOG,720.9300,44,NASDAQ +20160525 13:30:00.074,AAPL,98.6700,478343,NASDAQ +20160525 13:30:00.075,AAPL,98.6700,478343,NASDAQ +20160525 13:30:00.075,AAPL,98.6600,6,NASDAQ +20160525 13:30:00.075,AAPL,98.6500,30,NASDAQ +20160525 13:30:00.075,AAPL,98.6500,75,NASDAQ +20160525 13:30:00.075,AAPL,98.6500,20,NASDAQ +20160525 13:30:00.075,AAPL,98.6500,35,NASDAQ +20160525 13:30:00.075,AAPL,98.6500,10,NASDAQ +20160525 13:30:00.075,AAPL,98.5500,6,ARCA +20160525 13:30:00.075,AAPL,98.5500,6,ARCA +20160525 13:30:00.076,AAPL,98.5600,1000,ARCA +20160525 13:30:00.076,AAPL,98.5600,200,ARCA +20160525 13:30:00.076,AAPL,98.5600,300,ARCA +20160525 13:30:00.076,AAPL,98.5600,400,ARCA +20160525 13:30:00.076,AAPL,98.5600,600,ARCA +20160525 13:30:00.076,AAPL,98.5600,200,ARCA +20160525 13:30:00.078,MSFT,51.9500,783,NASDAQ +20160525 13:30:00.078,MSFT,51.9500,100,NASDAQ +20160525 13:30:00.078,MSFT,51.9500,100,NASDAQ diff --git a/pandas/tools/tests/data/trades2.csv b/pandas/tools/tests/data/trades2.csv new file mode 100644 index 0000000000000..64021faa68ce3 --- /dev/null +++ b/pandas/tools/tests/data/trades2.csv @@ -0,0 +1,78 @@ +time,ticker,price,quantity,marketCenter +20160525 13:30:00.023,MSFT,51.9500,75,NASDAQ +20160525 13:30:00.038,MSFT,51.9500,155,NASDAQ +20160525 13:30:00.048,GOOG,720.7700,100,NASDAQ +20160525 13:30:00.048,GOOG,720.9200,100,NASDAQ +20160525 13:30:00.048,GOOG,720.9300,200,NASDAQ +20160525 
13:30:00.048,GOOG,720.9300,300,NASDAQ +20160525 13:30:00.048,GOOG,720.9300,600,NASDAQ +20160525 13:30:00.048,GOOG,720.9300,44,NASDAQ +20160525 13:30:00.074,AAPL,98.6700,478343,NASDAQ +20160525 13:30:00.075,AAPL,98.6700,478343,NASDAQ +20160525 13:30:00.075,AAPL,98.6600,6,NASDAQ +20160525 13:30:00.075,AAPL,98.6500,30,NASDAQ +20160525 13:30:00.075,AAPL,98.6500,75,NASDAQ +20160525 13:30:00.075,AAPL,98.6500,20,NASDAQ +20160525 13:30:00.075,AAPL,98.6500,35,NASDAQ +20160525 13:30:00.075,AAPL,98.6500,10,NASDAQ +20160525 13:30:00.075,AAPL,98.5500,6,ARCA +20160525 13:30:00.075,AAPL,98.5500,6,ARCA +20160525 13:30:00.076,AAPL,98.5600,1000,ARCA +20160525 13:30:00.076,AAPL,98.5600,200,ARCA +20160525 13:30:00.076,AAPL,98.5600,300,ARCA +20160525 13:30:00.076,AAPL,98.5600,400,ARCA +20160525 13:30:00.076,AAPL,98.5600,600,ARCA +20160525 13:30:00.076,AAPL,98.5600,200,ARCA +20160525 13:30:00.078,MSFT,51.9500,783,NASDAQ +20160525 13:30:00.078,MSFT,51.9500,100,NASDAQ +20160525 13:30:00.078,MSFT,51.9500,100,NASDAQ +20160525 13:30:00.084,AAPL,98.6400,40,NASDAQ +20160525 13:30:00.084,AAPL,98.5500,149,EDGX +20160525 13:30:00.086,AAPL,98.5600,500,ARCA +20160525 13:30:00.104,AAPL,98.6300,647,EDGX +20160525 13:30:00.104,AAPL,98.6300,300,EDGX +20160525 13:30:00.104,AAPL,98.6300,50,NASDAQ +20160525 13:30:00.104,AAPL,98.6300,50,NASDAQ +20160525 13:30:00.104,AAPL,98.6300,70,NASDAQ +20160525 13:30:00.104,AAPL,98.6300,70,NASDAQ +20160525 13:30:00.104,AAPL,98.6300,1,NASDAQ +20160525 13:30:00.104,AAPL,98.6300,62,NASDAQ +20160525 13:30:00.104,AAPL,98.6300,10,NASDAQ +20160525 13:30:00.104,AAPL,98.6300,100,ARCA +20160525 13:30:00.105,AAPL,98.6300,100,ARCA +20160525 13:30:00.105,AAPL,98.6300,700,ARCA +20160525 13:30:00.106,AAPL,98.6300,61,EDGX +20160525 13:30:00.107,AAPL,98.6300,100,ARCA +20160525 13:30:00.107,AAPL,98.6300,53,ARCA +20160525 13:30:00.108,AAPL,98.6300,100,ARCA +20160525 13:30:00.108,AAPL,98.6300,839,ARCA +20160525 13:30:00.115,AAPL,98.6300,5,EDGX +20160525 13:30:00.118,AAPL,98.6300,295,EDGX +20160525 13:30:00.118,AAPL,98.6300,5,EDGX +20160525 13:30:00.128,AAPL,98.6300,100,NASDAQ +20160525 13:30:00.128,AAPL,98.6300,100,NASDAQ +20160525 13:30:00.128,MSFT,51.9200,100,ARCA +20160525 13:30:00.129,AAPL,98.6200,100,NASDAQ +20160525 13:30:00.129,AAPL,98.6200,10,NASDAQ +20160525 13:30:00.129,AAPL,98.6200,59,NASDAQ +20160525 13:30:00.129,AAPL,98.6200,31,NASDAQ +20160525 13:30:00.129,AAPL,98.6200,69,NASDAQ +20160525 13:30:00.129,AAPL,98.6200,12,NASDAQ +20160525 13:30:00.129,AAPL,98.6200,12,EDGX +20160525 13:30:00.129,AAPL,98.6200,100,ARCA +20160525 13:30:00.129,AAPL,98.6200,100,ARCA +20160525 13:30:00.130,MSFT,51.9500,317,ARCA +20160525 13:30:00.130,MSFT,51.9500,283,ARCA +20160525 13:30:00.135,MSFT,51.9300,100,EDGX +20160525 13:30:00.135,AAPL,98.6200,100,ARCA +20160525 13:30:00.144,AAPL,98.6200,12,NASDAQ +20160525 13:30:00.144,AAPL,98.6200,88,NASDAQ +20160525 13:30:00.144,AAPL,98.6200,162,NASDAQ +20160525 13:30:00.144,AAPL,98.6100,100,BATS +20160525 13:30:00.144,AAPL,98.6200,61,ARCA +20160525 13:30:00.144,AAPL,98.6200,25,ARCA +20160525 13:30:00.144,AAPL,98.6200,14,ARCA +20160525 13:30:00.145,AAPL,98.6200,12,ARCA +20160525 13:30:00.145,AAPL,98.6200,100,ARCA +20160525 13:30:00.145,AAPL,98.6300,100,NASDAQ +20160525 13:30:00.145,AAPL,98.6300,100,NASDAQ diff --git a/pandas/tools/tests/test_concat.py b/pandas/tools/tests/test_concat.py index 9d9b0635e0f35..8e20cfa83c405 100644 --- a/pandas/tools/tests/test_concat.py +++ b/pandas/tools/tests/test_concat.py @@ -4,19 +4,20 @@ from numpy.random import randn from datetime import datetime 
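The trades*/quotes* fixtures added above pair trade prints with the prevailing quote, and the asof/tolerance/allow_exact_matches files look like expected outputs for an as-of join. A minimal sketch of the kind of call they would back, assuming pandas.merge_asof from the 0.19 line; the tolerance values below are illustrative, not taken from the tests.

import pandas as pd

# fixture paths as added by this diff
trades = pd.read_csv('pandas/tools/tests/data/trades.csv', parse_dates=['time'])
quotes = pd.read_csv('pandas/tools/tests/data/quotes.csv', parse_dates=['time'])

# match every trade with the most recent quote for the same ticker
asof = pd.merge_asof(trades, quotes, on='time', by='ticker')

# only accept quotes within a time window of the trade ...
tol = pd.merge_asof(trades, quotes, on='time', by='ticker',
                    tolerance=pd.Timedelta('2ms'))

# ... and optionally refuse quotes stamped at exactly the trade time
no_exact = pd.merge_asof(trades, quotes, on='time', by='ticker',
                         tolerance=pd.Timedelta('10ms'),
                         allow_exact_matches=False)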
-from pandas.compat import StringIO +from pandas.compat import StringIO, iteritems import pandas as pd from pandas import (DataFrame, concat, read_csv, isnull, Series, date_range, Index, Panel, MultiIndex, Timestamp, - DatetimeIndex) + DatetimeIndex, Categorical) +from pandas.types.concat import union_categoricals from pandas.util import testing as tm from pandas.util.testing import (assert_frame_equal, makeCustomDataframe as mkdf, assert_almost_equal) -class TestConcatenate(tm.TestCase): +class ConcatenateBase(tm.TestCase): _multiprocess_can_split_ = True @@ -25,6 +26,681 @@ def setUp(self): self.mixed_frame = self.frame.copy() self.mixed_frame['foo'] = 'bar' + +class TestConcatAppendCommon(ConcatenateBase): + + """ + Test common dtype coercion rules between concat and append. + """ + + def setUp(self): + + dt_data = [pd.Timestamp('2011-01-01'), + pd.Timestamp('2011-01-02'), + pd.Timestamp('2011-01-03')] + tz_data = [pd.Timestamp('2011-01-01', tz='US/Eastern'), + pd.Timestamp('2011-01-02', tz='US/Eastern'), + pd.Timestamp('2011-01-03', tz='US/Eastern')] + + td_data = [pd.Timedelta('1 days'), + pd.Timedelta('2 days'), + pd.Timedelta('3 days')] + + period_data = [pd.Period('2011-01', freq='M'), + pd.Period('2011-02', freq='M'), + pd.Period('2011-03', freq='M')] + + self.data = {'bool': [True, False, True], + 'int64': [1, 2, 3], + 'float64': [1.1, np.nan, 3.3], + 'category': pd.Categorical(['X', 'Y', 'Z']), + 'object': ['a', 'b', 'c'], + 'datetime64[ns]': dt_data, + 'datetime64[ns, US/Eastern]': tz_data, + 'timedelta64[ns]': td_data, + 'period[M]': period_data} + + def _check_expected_dtype(self, obj, label): + """ + Check whether obj has expected dtype depending on label + considering not-supported dtypes + """ + if isinstance(obj, pd.Index): + if label == 'bool': + self.assertEqual(obj.dtype, 'object') + else: + self.assertEqual(obj.dtype, label) + elif isinstance(obj, pd.Series): + if label.startswith('period'): + self.assertEqual(obj.dtype, 'object') + else: + self.assertEqual(obj.dtype, label) + else: + raise ValueError + + def test_dtypes(self): + # to confirm test case covers intended dtypes + for typ, vals in iteritems(self.data): + self._check_expected_dtype(pd.Index(vals), typ) + self._check_expected_dtype(pd.Series(vals), typ) + + def test_concatlike_same_dtypes(self): + # GH 13660 + for typ1, vals1 in iteritems(self.data): + + vals2 = vals1 + vals3 = vals1 + + if typ1 == 'category': + exp_data = pd.Categorical(list(vals1) + list(vals2)) + exp_data3 = pd.Categorical(list(vals1) + list(vals2) + + list(vals3)) + else: + exp_data = vals1 + vals2 + exp_data3 = vals1 + vals2 + vals3 + + # ----- Index ----- # + + # index.append + res = pd.Index(vals1).append(pd.Index(vals2)) + exp = pd.Index(exp_data) + tm.assert_index_equal(res, exp) + + # 3 elements + res = pd.Index(vals1).append([pd.Index(vals2), pd.Index(vals3)]) + exp = pd.Index(exp_data3) + tm.assert_index_equal(res, exp) + + # index.append name mismatch + i1 = pd.Index(vals1, name='x') + i2 = pd.Index(vals2, name='y') + res = i1.append(i2) + exp = pd.Index(exp_data) + tm.assert_index_equal(res, exp) + + # index.append name match + i1 = pd.Index(vals1, name='x') + i2 = pd.Index(vals2, name='x') + res = i1.append(i2) + exp = pd.Index(exp_data, name='x') + tm.assert_index_equal(res, exp) + + # cannot append non-index + with tm.assertRaisesRegexp(TypeError, 'all inputs must be Index'): + pd.Index(vals1).append(vals2) + + with tm.assertRaisesRegexp(TypeError, 'all inputs must be Index'): + pd.Index(vals1).append([pd.Index(vals2), 
vals3]) + + # ----- Series ----- # + + # series.append + res = pd.Series(vals1).append(pd.Series(vals2), + ignore_index=True) + exp = pd.Series(exp_data) + tm.assert_series_equal(res, exp, check_index_type=True) + + # concat + res = pd.concat([pd.Series(vals1), pd.Series(vals2)], + ignore_index=True) + tm.assert_series_equal(res, exp, check_index_type=True) + + # 3 elements + res = pd.Series(vals1).append([pd.Series(vals2), pd.Series(vals3)], + ignore_index=True) + exp = pd.Series(exp_data3) + tm.assert_series_equal(res, exp) + + res = pd.concat([pd.Series(vals1), pd.Series(vals2), + pd.Series(vals3)], ignore_index=True) + tm.assert_series_equal(res, exp) + + # name mismatch + s1 = pd.Series(vals1, name='x') + s2 = pd.Series(vals2, name='y') + res = s1.append(s2, ignore_index=True) + exp = pd.Series(exp_data) + tm.assert_series_equal(res, exp, check_index_type=True) + + res = pd.concat([s1, s2], ignore_index=True) + tm.assert_series_equal(res, exp, check_index_type=True) + + # name match + s1 = pd.Series(vals1, name='x') + s2 = pd.Series(vals2, name='x') + res = s1.append(s2, ignore_index=True) + exp = pd.Series(exp_data, name='x') + tm.assert_series_equal(res, exp, check_index_type=True) + + res = pd.concat([s1, s2], ignore_index=True) + tm.assert_series_equal(res, exp, check_index_type=True) + + # cannot append non-index + msg = "cannot concatenate a non-NDFrame object" + with tm.assertRaisesRegexp(TypeError, msg): + pd.Series(vals1).append(vals2) + + with tm.assertRaisesRegexp(TypeError, msg): + pd.Series(vals1).append([pd.Series(vals2), vals3]) + + with tm.assertRaisesRegexp(TypeError, msg): + pd.concat([pd.Series(vals1), vals2]) + + with tm.assertRaisesRegexp(TypeError, msg): + pd.concat([pd.Series(vals1), pd.Series(vals2), vals3]) + + def test_concatlike_dtypes_coercion(self): + # GH 13660 + for typ1, vals1 in iteritems(self.data): + for typ2, vals2 in iteritems(self.data): + + vals3 = vals2 + + # basically infer + exp_index_dtype = None + exp_series_dtype = None + + if typ1 == typ2: + # same dtype is tested in test_concatlike_same_dtypes + continue + elif typ1 == 'category' or typ2 == 'category': + # ToDo: suspicious + continue + + # specify expected dtype + if typ1 == 'bool' and typ2 in ('int64', 'float64'): + # series coerces to numeric based on numpy rule + # index doesn't because bool is object dtype + exp_series_dtype = typ2 + elif typ2 == 'bool' and typ1 in ('int64', 'float64'): + exp_series_dtype = typ1 + elif (typ1 == 'datetime64[ns, US/Eastern]' or + typ2 == 'datetime64[ns, US/Eastern]' or + typ1 == 'timedelta64[ns]' or + typ2 == 'timedelta64[ns]'): + exp_index_dtype = object + exp_series_dtype = object + + exp_data = vals1 + vals2 + exp_data3 = vals1 + vals2 + vals3 + + # ----- Index ----- # + + # index.append + res = pd.Index(vals1).append(pd.Index(vals2)) + exp = pd.Index(exp_data, dtype=exp_index_dtype) + tm.assert_index_equal(res, exp) + + # 3 elements + res = pd.Index(vals1).append([pd.Index(vals2), + pd.Index(vals3)]) + exp = pd.Index(exp_data3, dtype=exp_index_dtype) + tm.assert_index_equal(res, exp) + + # ----- Series ----- # + + # series.append + res = pd.Series(vals1).append(pd.Series(vals2), + ignore_index=True) + exp = pd.Series(exp_data, dtype=exp_series_dtype) + tm.assert_series_equal(res, exp, check_index_type=True) + + # concat + res = pd.concat([pd.Series(vals1), pd.Series(vals2)], + ignore_index=True) + tm.assert_series_equal(res, exp, check_index_type=True) + + # 3 elements + res = pd.Series(vals1).append([pd.Series(vals2), + pd.Series(vals3)], + 
ignore_index=True) + exp = pd.Series(exp_data3, dtype=exp_series_dtype) + tm.assert_series_equal(res, exp) + + res = pd.concat([pd.Series(vals1), pd.Series(vals2), + pd.Series(vals3)], ignore_index=True) + tm.assert_series_equal(res, exp) + + def test_concatlike_common_coerce_to_pandas_object(self): + # GH 13626 + # result must be Timestamp/Timedelta, not datetime.datetime/timedelta + dti = pd.DatetimeIndex(['2011-01-01', '2011-01-02']) + tdi = pd.TimedeltaIndex(['1 days', '2 days']) + + exp = pd.Index([pd.Timestamp('2011-01-01'), + pd.Timestamp('2011-01-02'), + pd.Timedelta('1 days'), + pd.Timedelta('2 days')]) + + res = dti.append(tdi) + tm.assert_index_equal(res, exp) + tm.assertIsInstance(res[0], pd.Timestamp) + tm.assertIsInstance(res[-1], pd.Timedelta) + + dts = pd.Series(dti) + tds = pd.Series(tdi) + res = dts.append(tds) + tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + tm.assertIsInstance(res.iloc[0], pd.Timestamp) + tm.assertIsInstance(res.iloc[-1], pd.Timedelta) + + res = pd.concat([dts, tds]) + tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + tm.assertIsInstance(res.iloc[0], pd.Timestamp) + tm.assertIsInstance(res.iloc[-1], pd.Timedelta) + + def test_concatlike_datetimetz(self): + # GH 7795 + for tz in ['UTC', 'US/Eastern', 'Asia/Tokyo']: + dti1 = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], tz=tz) + dti2 = pd.DatetimeIndex(['2012-01-01', '2012-01-02'], tz=tz) + + exp = pd.DatetimeIndex(['2011-01-01', '2011-01-02', + '2012-01-01', '2012-01-02'], tz=tz) + + res = dti1.append(dti2) + tm.assert_index_equal(res, exp) + + dts1 = pd.Series(dti1) + dts2 = pd.Series(dti2) + res = dts1.append(dts2) + tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + + res = pd.concat([dts1, dts2]) + tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + + def test_concatlike_datetimetz_short(self): + # GH 7795 + for tz in ['UTC', 'US/Eastern', 'Asia/Tokyo', 'EST5EDT']: + + ix1 = pd.DatetimeIndex(start='2014-07-15', end='2014-07-17', + freq='D', tz=tz) + ix2 = pd.DatetimeIndex(['2014-07-11', '2014-07-21'], tz=tz) + df1 = pd.DataFrame(0, index=ix1, columns=['A', 'B']) + df2 = pd.DataFrame(0, index=ix2, columns=['A', 'B']) + + exp_idx = pd.DatetimeIndex(['2014-07-15', '2014-07-16', + '2014-07-17', '2014-07-11', + '2014-07-21'], tz=tz) + exp = pd.DataFrame(0, index=exp_idx, columns=['A', 'B']) + + tm.assert_frame_equal(df1.append(df2), exp) + tm.assert_frame_equal(pd.concat([df1, df2]), exp) + + def test_concatlike_datetimetz_to_object(self): + # GH 13660 + + # different tz coerces to object + for tz in ['UTC', 'US/Eastern', 'Asia/Tokyo']: + dti1 = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], tz=tz) + dti2 = pd.DatetimeIndex(['2012-01-01', '2012-01-02']) + + exp = pd.Index([pd.Timestamp('2011-01-01', tz=tz), + pd.Timestamp('2011-01-02', tz=tz), + pd.Timestamp('2012-01-01'), + pd.Timestamp('2012-01-02')], dtype=object) + + res = dti1.append(dti2) + tm.assert_index_equal(res, exp) + + dts1 = pd.Series(dti1) + dts2 = pd.Series(dti2) + res = dts1.append(dts2) + tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + + res = pd.concat([dts1, dts2]) + tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + + # different tz + dti3 = pd.DatetimeIndex(['2012-01-01', '2012-01-02'], + tz='US/Pacific') + + exp = pd.Index([pd.Timestamp('2011-01-01', tz=tz), + pd.Timestamp('2011-01-02', tz=tz), + pd.Timestamp('2012-01-01', tz='US/Pacific'), + pd.Timestamp('2012-01-02', tz='US/Pacific')], + dtype=object) + + res = dti1.append(dti3) + # 
tm.assert_index_equal(res, exp) + + dts1 = pd.Series(dti1) + dts3 = pd.Series(dti3) + res = dts1.append(dts3) + tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + + res = pd.concat([dts1, dts3]) + tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + + def test_concatlike_common_period(self): + # GH 13660 + pi1 = pd.PeriodIndex(['2011-01', '2011-02'], freq='M') + pi2 = pd.PeriodIndex(['2012-01', '2012-02'], freq='M') + + exp = pd.PeriodIndex(['2011-01', '2011-02', '2012-01', + '2012-02'], freq='M') + + res = pi1.append(pi2) + tm.assert_index_equal(res, exp) + + ps1 = pd.Series(pi1) + ps2 = pd.Series(pi2) + res = ps1.append(ps2) + tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + + res = pd.concat([ps1, ps2]) + tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + + def test_concatlike_common_period_diff_freq_to_object(self): + # GH 13221 + pi1 = pd.PeriodIndex(['2011-01', '2011-02'], freq='M') + pi2 = pd.PeriodIndex(['2012-01-01', '2012-02-01'], freq='D') + + exp = pd.Index([pd.Period('2011-01', freq='M'), + pd.Period('2011-02', freq='M'), + pd.Period('2012-01-01', freq='D'), + pd.Period('2012-02-01', freq='D')], dtype=object) + + res = pi1.append(pi2) + tm.assert_index_equal(res, exp) + + ps1 = pd.Series(pi1) + ps2 = pd.Series(pi2) + res = ps1.append(ps2) + tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + + res = pd.concat([ps1, ps2]) + tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + + def test_concatlike_common_period_mixed_dt_to_object(self): + # GH 13221 + # different datetimelike + pi1 = pd.PeriodIndex(['2011-01', '2011-02'], freq='M') + tdi = pd.TimedeltaIndex(['1 days', '2 days']) + exp = pd.Index([pd.Period('2011-01', freq='M'), + pd.Period('2011-02', freq='M'), + pd.Timedelta('1 days'), + pd.Timedelta('2 days')], dtype=object) + + res = pi1.append(tdi) + tm.assert_index_equal(res, exp) + + ps1 = pd.Series(pi1) + tds = pd.Series(tdi) + res = ps1.append(tds) + tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + + res = pd.concat([ps1, tds]) + tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + + # inverse + exp = pd.Index([pd.Timedelta('1 days'), + pd.Timedelta('2 days'), + pd.Period('2011-01', freq='M'), + pd.Period('2011-02', freq='M')], dtype=object) + + res = tdi.append(pi1) + tm.assert_index_equal(res, exp) + + ps1 = pd.Series(pi1) + tds = pd.Series(tdi) + res = tds.append(ps1) + tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + + res = pd.concat([tds, ps1]) + tm.assert_series_equal(res, pd.Series(exp, index=[0, 1, 0, 1])) + + def test_concat_categorical(self): + # GH 13524 + + # same categories -> category + s1 = pd.Series([1, 2, np.nan], dtype='category') + s2 = pd.Series([2, 1, 2], dtype='category') + + exp = pd.Series([1, 2, np.nan, 2, 1, 2], dtype='category') + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + # partially different categories => not-category + s1 = pd.Series([3, 2], dtype='category') + s2 = pd.Series([2, 1], dtype='category') + + exp = pd.Series([3, 2, 2, 1]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + # completelly different categories (same dtype) => not-category + s1 = pd.Series([10, 11, np.nan], dtype='category') + s2 = pd.Series([np.nan, 1, 3, 2], dtype='category') + + exp = pd.Series([10, 11, np.nan, np.nan, 1, 3, 2]) + 
tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + def test_concat_categorical_coercion(self): + # GH 13524 + + # category + not-category => not-category + s1 = pd.Series([1, 2, np.nan], dtype='category') + s2 = pd.Series([2, 1, 2]) + + exp = pd.Series([1, 2, np.nan, 2, 1, 2]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + # result shouldn't be affected by 1st elem dtype + exp = pd.Series([2, 1, 2, 1, 2, np.nan]) + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) + + # all values are not in category => not-category + s1 = pd.Series([3, 2], dtype='category') + s2 = pd.Series([2, 1]) + + exp = pd.Series([3, 2, 2, 1]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + exp = pd.Series([2, 1, 3, 2]) + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) + + # completelly different categories => not-category + s1 = pd.Series([10, 11, np.nan], dtype='category') + s2 = pd.Series([1, 3, 2]) + + exp = pd.Series([10, 11, np.nan, 1, 3, 2]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + exp = pd.Series([1, 3, 2, 10, 11, np.nan]) + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) + + # different dtype => not-category + s1 = pd.Series([10, 11, np.nan], dtype='category') + s2 = pd.Series(['a', 'b', 'c']) + + exp = pd.Series([10, 11, np.nan, 'a', 'b', 'c']) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + exp = pd.Series(['a', 'b', 'c', 10, 11, np.nan]) + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) + + # if normal series only contains NaN-likes => not-category + s1 = pd.Series([10, 11], dtype='category') + s2 = pd.Series([np.nan, np.nan, np.nan]) + + exp = pd.Series([10, 11, np.nan, np.nan, np.nan]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + exp = pd.Series([np.nan, np.nan, np.nan, 10, 11]) + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) + + def test_concat_categorical_3elem_coercion(self): + # GH 13524 + + # mixed dtypes => not-category + s1 = pd.Series([1, 2, np.nan], dtype='category') + s2 = pd.Series([2, 1, 2], dtype='category') + s3 = pd.Series([1, 2, 1, 2, np.nan]) + + exp = pd.Series([1, 2, np.nan, 2, 1, 2, 1, 2, 1, 2, np.nan]) + tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp) + tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp) + + exp = pd.Series([1, 2, 1, 2, np.nan, 1, 2, np.nan, 2, 1, 2]) + tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp) + + # values are all in either category => not-category + s1 = pd.Series([4, 5, 6], dtype='category') + s2 = pd.Series([1, 2, 3], dtype='category') + s3 = pd.Series([1, 3, 4]) + + exp = 
pd.Series([4, 5, 6, 1, 2, 3, 1, 3, 4]) + tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp) + tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp) + + exp = pd.Series([1, 3, 4, 4, 5, 6, 1, 2, 3]) + tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp) + + # values are all in either category => not-category + s1 = pd.Series([4, 5, 6], dtype='category') + s2 = pd.Series([1, 2, 3], dtype='category') + s3 = pd.Series([10, 11, 12]) + + exp = pd.Series([4, 5, 6, 1, 2, 3, 10, 11, 12]) + tm.assert_series_equal(pd.concat([s1, s2, s3], ignore_index=True), exp) + tm.assert_series_equal(s1.append([s2, s3], ignore_index=True), exp) + + exp = pd.Series([10, 11, 12, 4, 5, 6, 1, 2, 3]) + tm.assert_series_equal(pd.concat([s3, s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s3.append([s1, s2], ignore_index=True), exp) + + def test_concat_categorical_multi_coercion(self): + # GH 13524 + + s1 = pd.Series([1, 3], dtype='category') + s2 = pd.Series([3, 4], dtype='category') + s3 = pd.Series([2, 3]) + s4 = pd.Series([2, 2], dtype='category') + s5 = pd.Series([1, np.nan]) + s6 = pd.Series([1, 3, 2], dtype='category') + + # mixed dtype, values are all in categories => not-category + exp = pd.Series([1, 3, 3, 4, 2, 3, 2, 2, 1, np.nan, 1, 3, 2]) + res = pd.concat([s1, s2, s3, s4, s5, s6], ignore_index=True) + tm.assert_series_equal(res, exp) + res = s1.append([s2, s3, s4, s5, s6], ignore_index=True) + tm.assert_series_equal(res, exp) + + exp = pd.Series([1, 3, 2, 1, np.nan, 2, 2, 2, 3, 3, 4, 1, 3]) + res = pd.concat([s6, s5, s4, s3, s2, s1], ignore_index=True) + tm.assert_series_equal(res, exp) + res = s6.append([s5, s4, s3, s2, s1], ignore_index=True) + tm.assert_series_equal(res, exp) + + def test_concat_categorical_ordered(self): + # GH 13524 + + s1 = pd.Series(pd.Categorical([1, 2, np.nan], ordered=True)) + s2 = pd.Series(pd.Categorical([2, 1, 2], ordered=True)) + + exp = pd.Series(pd.Categorical([1, 2, np.nan, 2, 1, 2], ordered=True)) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + exp = pd.Series(pd.Categorical([1, 2, np.nan, 2, 1, 2, 1, 2, np.nan], + ordered=True)) + tm.assert_series_equal(pd.concat([s1, s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s1.append([s2, s1], ignore_index=True), exp) + + def test_concat_categorical_coercion_nan(self): + # GH 13524 + + # some edge cases + # category + not-category => not category + s1 = pd.Series(np.array([np.nan, np.nan], dtype=np.float64), + dtype='category') + s2 = pd.Series([np.nan, 1]) + + exp = pd.Series([np.nan, np.nan, np.nan, 1]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + s1 = pd.Series([1, np.nan], dtype='category') + s2 = pd.Series([np.nan, np.nan]) + + exp = pd.Series([1, np.nan, np.nan, np.nan]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + # mixed dtype, all nan-likes => not-category + s1 = pd.Series([np.nan, np.nan], dtype='category') + s2 = pd.Series([np.nan, np.nan]) + + exp = pd.Series([np.nan, np.nan, np.nan, np.nan], dtype=object) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) 
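The categorical tests above pin down one rule: concat and append keep the category dtype only when every input is categorical with an identical category set; any mismatch, or mixing in a plain Series, falls back to the ordinary inferred dtype. A minimal sketch of that rule under the 0.19-era behaviour these tests describe:

import pandas as pd

a = pd.Series([1, 2], dtype='category')   # categories {1, 2}
b = pd.Series([2, 1], dtype='category')   # same categories -> stays category
c = pd.Series([3, 4], dtype='category')   # different categories -> falls back
d = pd.Series([2, 1])                     # plain int64 Series -> falls back

print(pd.concat([a, b], ignore_index=True).dtype)   # category
print(pd.concat([a, c], ignore_index=True).dtype)   # int64
print(pd.concat([a, d], ignore_index=True).dtype)   # int64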
+ tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) + + # all category nan-likes => category + s1 = pd.Series([np.nan, np.nan], dtype='category') + s2 = pd.Series([np.nan, np.nan], dtype='category') + + exp = pd.Series([np.nan, np.nan, np.nan, np.nan], dtype='category') + + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + def test_concat_categorical_empty(self): + # GH 13524 + + s1 = pd.Series([], dtype='category') + s2 = pd.Series([1, 2], dtype='category') + + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2) + tm.assert_series_equal(s1.append(s2, ignore_index=True), s2) + + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s2) + tm.assert_series_equal(s2.append(s1, ignore_index=True), s2) + + s1 = pd.Series([], dtype='category') + s2 = pd.Series([], dtype='category') + + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2) + tm.assert_series_equal(s1.append(s2, ignore_index=True), s2) + + s1 = pd.Series([], dtype='category') + s2 = pd.Series([]) + + # different dtype => not-category + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), s2) + tm.assert_series_equal(s1.append(s2, ignore_index=True), s2) + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), s2) + tm.assert_series_equal(s2.append(s1, ignore_index=True), s2) + + s1 = pd.Series([], dtype='category') + s2 = pd.Series([np.nan, np.nan]) + + # empty Series is ignored + exp = pd.Series([np.nan, np.nan]) + tm.assert_series_equal(pd.concat([s1, s2], ignore_index=True), exp) + tm.assert_series_equal(s1.append(s2, ignore_index=True), exp) + + tm.assert_series_equal(pd.concat([s2, s1], ignore_index=True), exp) + tm.assert_series_equal(s2.append(s1, ignore_index=True), exp) + + +class TestAppend(ConcatenateBase): + def test_append(self): begin_index = self.frame.index[:5] end_index = self.frame.index[5:] @@ -141,42 +817,32 @@ def test_append_preserve_index_name(self): result = df1.append(df2) self.assertEqual(result.index.name, 'A') - def test_join_many(self): - df = DataFrame(np.random.randn(10, 6), columns=list('abcdef')) - df_list = [df[['a', 'b']], df[['c', 'd']], df[['e', 'f']]] - - joined = df_list[0].join(df_list[1:]) - tm.assert_frame_equal(joined, df) - - df_list = [df[['a', 'b']][:-2], - df[['c', 'd']][2:], df[['e', 'f']][1:9]] - - def _check_diff_index(df_list, result, exp_index): - reindexed = [x.reindex(exp_index) for x in df_list] - expected = reindexed[0].join(reindexed[1:]) - tm.assert_frame_equal(result, expected) - - # different join types - joined = df_list[0].join(df_list[1:], how='outer') - _check_diff_index(df_list, joined, df.index) - - joined = df_list[0].join(df_list[1:]) - _check_diff_index(df_list, joined, df_list[0].index) - - joined = df_list[0].join(df_list[1:], how='inner') - _check_diff_index(df_list, joined, df.index[2:8]) - - self.assertRaises(ValueError, df_list[0].join, df_list[1:], on='a') - - def test_join_many_mixed(self): - df = DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D']) - df['key'] = ['foo', 'bar'] * 4 - df1 = df.ix[:, ['A', 'B']] - df2 = df.ix[:, ['C', 'D']] - df3 = df.ix[:, ['key']] - - result = df1.join([df2, df3]) - assert_frame_equal(result, df) + def test_append_dtype_coerce(self): + + # GH 4993 + # appending with datetime will incorrectly convert datetime64 + import datetime as dt + from pandas import NaT + + df1 = DataFrame(index=[1, 2], data=[dt.datetime(2013, 1, 1, 0, 0), + dt.datetime(2013, 1, 2, 0, 0)], 
+ columns=['start_time']) + df2 = DataFrame(index=[4, 5], data=[[dt.datetime(2013, 1, 3, 0, 0), + dt.datetime(2013, 1, 3, 6, 10)], + [dt.datetime(2013, 1, 4, 0, 0), + dt.datetime(2013, 1, 4, 7, 10)]], + columns=['start_time', 'end_time']) + + expected = concat([Series([NaT, NaT, dt.datetime(2013, 1, 3, 6, 10), + dt.datetime(2013, 1, 4, 7, 10)], + name='end_time'), + Series([dt.datetime(2013, 1, 1, 0, 0), + dt.datetime(2013, 1, 2, 0, 0), + dt.datetime(2013, 1, 3, 0, 0), + dt.datetime(2013, 1, 4, 0, 0)], + name='start_time')], axis=1) + result = df1.append(df2, ignore_index=True) + assert_frame_equal(result, expected) def test_append_missing_column_proper_upcast(self): df1 = DataFrame({'A': np.array([1, 2, 3, 4], dtype='i8')}) @@ -187,6 +853,9 @@ def test_append_missing_column_proper_upcast(self): self.assertEqual(appended['A'].dtype, 'f8') self.assertEqual(appended['B'].dtype, 'O') + +class TestConcatenate(ConcatenateBase): + def test_concat_copy(self): df = DataFrame(np.random.randn(4, 3)) @@ -523,35 +1192,6 @@ def test_with_mixed_tuples(self): # it works concat([df1, df2]) - def test_join_dups(self): - - # joining dups - df = concat([DataFrame(np.random.randn(10, 4), - columns=['A', 'A', 'B', 'B']), - DataFrame(np.random.randint(0, 10, size=20) - .reshape(10, 2), - columns=['A', 'C'])], - axis=1) - - expected = concat([df, df], axis=1) - result = df.join(df, rsuffix='_2') - result.columns = expected.columns - assert_frame_equal(result, expected) - - # GH 4975, invalid join on dups - w = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) - x = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) - y = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) - z = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) - - dta = x.merge(y, left_index=True, right_index=True).merge( - z, left_index=True, right_index=True, how="outer") - dta = dta.merge(w, left_index=True, right_index=True) - expected = concat([x, y, z, w], axis=1) - expected.columns = ['x_x', 'y_x', 'x_y', - 'y_y', 'x_x', 'y_x', 'x_y', 'y_y'] - assert_frame_equal(dta, expected) - def test_handle_empty_objects(self): df = DataFrame(np.random.randn(10, 4), columns=list('abcd')) @@ -648,86 +1288,40 @@ def test_concat_mixed_objs(self): panel = tm.makePanel() self.assertRaises(ValueError, lambda: concat([panel, s1], axis=1)) - def test_panel_join(self): - panel = tm.makePanel() - tm.add_nans(panel) - - p1 = panel.ix[:2, :10, :3] - p2 = panel.ix[2:, 5:, 2:] + def test_empty_dtype_coerce(self): - # left join - result = p1.join(p2) - expected = p1.copy() - expected['ItemC'] = p2['ItemC'] - tm.assert_panel_equal(result, expected) + # xref to #12411 + # xref to #12045 + # xref to #11594 + # see below - # right join - result = p1.join(p2, how='right') - expected = p2.copy() - expected['ItemA'] = p1['ItemA'] - expected['ItemB'] = p1['ItemB'] - expected = expected.reindex(items=['ItemA', 'ItemB', 'ItemC']) - tm.assert_panel_equal(result, expected) + # 10571 + df1 = DataFrame(data=[[1, None], [2, None]], columns=['a', 'b']) + df2 = DataFrame(data=[[3, None], [4, None]], columns=['a', 'b']) + result = concat([df1, df2]) + expected = df1.dtypes + tm.assert_series_equal(result.dtypes, expected) - # inner join - result = p1.join(p2, how='inner') - expected = panel.ix[:, 5:10, 2:3] - tm.assert_panel_equal(result, expected) + def test_dtype_coerceion(self): - # outer join - result = p1.join(p2, how='outer') - expected = p1.reindex(major=panel.major_axis, - minor=panel.minor_axis) - expected = expected.join(p2.reindex(major=panel.major_axis, - 
minor=panel.minor_axis)) - tm.assert_panel_equal(result, expected) + # 12411 + df = DataFrame({'date': [pd.Timestamp('20130101').tz_localize('UTC'), + pd.NaT]}) - def test_panel_join_overlap(self): - panel = tm.makePanel() - tm.add_nans(panel) - - p1 = panel.ix[['ItemA', 'ItemB', 'ItemC']] - p2 = panel.ix[['ItemB', 'ItemC']] - - # Expected index is - # - # ItemA, ItemB_p1, ItemC_p1, ItemB_p2, ItemC_p2 - joined = p1.join(p2, lsuffix='_p1', rsuffix='_p2') - p1_suf = p1.ix[['ItemB', 'ItemC']].add_suffix('_p1') - p2_suf = p2.ix[['ItemB', 'ItemC']].add_suffix('_p2') - no_overlap = panel.ix[['ItemA']] - expected = no_overlap.join(p1_suf.join(p2_suf)) - tm.assert_panel_equal(joined, expected) - - def test_panel_join_many(self): - tm.K = 10 - panel = tm.makePanel() - tm.K = 4 + result = concat([df.iloc[[0]], df.iloc[[1]]]) + tm.assert_series_equal(result.dtypes, df.dtypes) - panels = [panel.ix[:2], panel.ix[2:6], panel.ix[6:]] + # 12045 + import datetime + df = DataFrame({'date': [datetime.datetime(2012, 1, 1), + datetime.datetime(1012, 1, 2)]}) + result = concat([df.iloc[[0]], df.iloc[[1]]]) + tm.assert_series_equal(result.dtypes, df.dtypes) - joined = panels[0].join(panels[1:]) - tm.assert_panel_equal(joined, panel) - - panels = [panel.ix[:2, :-5], panel.ix[2:6, 2:], panel.ix[6:, 5:-7]] - - data_dict = {} - for p in panels: - data_dict.update(p.iteritems()) - - joined = panels[0].join(panels[1:], how='inner') - expected = Panel.from_dict(data_dict, intersect=True) - tm.assert_panel_equal(joined, expected) - - joined = panels[0].join(panels[1:], how='outer') - expected = Panel.from_dict(data_dict, intersect=False) - tm.assert_panel_equal(joined, expected) - - # edge cases - self.assertRaises(ValueError, panels[0].join, panels[1:], - how='outer', lsuffix='foo', rsuffix='bar') - self.assertRaises(ValueError, panels[0].join, panels[1:], - how='right') + # 11594 + df = DataFrame({'text': ['some words'] + [None] * 9}) + result = concat([df.iloc[[0]], df.iloc[[1]]]) + tm.assert_series_equal(result.dtypes, df.dtypes) def test_panel_concat_other_axes(self): panel = tm.makePanel() @@ -782,35 +1376,37 @@ def df(): concat([panel1, panel3], axis=1, verify_integrity=True) def test_panel4d_concat(self): - p4d = tm.makePanel4D() + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + p4d = tm.makePanel4D() - p1 = p4d.ix[:, :, :5, :] - p2 = p4d.ix[:, :, 5:, :] + p1 = p4d.ix[:, :, :5, :] + p2 = p4d.ix[:, :, 5:, :] - result = concat([p1, p2], axis=2) - tm.assert_panel4d_equal(result, p4d) + result = concat([p1, p2], axis=2) + tm.assert_panel4d_equal(result, p4d) - p1 = p4d.ix[:, :, :, :2] - p2 = p4d.ix[:, :, :, 2:] + p1 = p4d.ix[:, :, :, :2] + p2 = p4d.ix[:, :, :, 2:] - result = concat([p1, p2], axis=3) - tm.assert_panel4d_equal(result, p4d) + result = concat([p1, p2], axis=3) + tm.assert_panel4d_equal(result, p4d) def test_panel4d_concat_mixed_type(self): - p4d = tm.makePanel4D() + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + p4d = tm.makePanel4D() - # if things are a bit misbehaved - p1 = p4d.ix[:, :2, :, :2] - p2 = p4d.ix[:, :, :, 2:] - p1['L5'] = 'baz' + # if things are a bit misbehaved + p1 = p4d.ix[:, :2, :, :2] + p2 = p4d.ix[:, :, :, 2:] + p1['L5'] = 'baz' - result = concat([p1, p2], axis=3) + result = concat([p1, p2], axis=3) - p2['L5'] = np.nan - expected = concat([p1, p2], axis=3) - expected = expected.ix[result.labels] + p2['L5'] = np.nan + expected = concat([p1, p2], axis=3) + expected = expected.ix[result.labels] - tm.assert_panel4d_equal(result, 
expected) + tm.assert_panel4d_equal(result, expected) def test_concat_series(self): @@ -919,6 +1515,262 @@ def test_concat_keys_with_none(self): keys=['b', 'c', 'd', 'e']) tm.assert_frame_equal(result, expected) + def test_union_categorical(self): + # GH 13361 + data = [ + (list('abc'), list('abd'), list('abcabd')), + ([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4]), + ([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4]), + + (['b', 'b', np.nan, 'a'], ['a', np.nan, 'c'], + ['b', 'b', np.nan, 'a', 'a', np.nan, 'c']), + + (pd.date_range('2014-01-01', '2014-01-05'), + pd.date_range('2014-01-06', '2014-01-07'), + pd.date_range('2014-01-01', '2014-01-07')), + + (pd.date_range('2014-01-01', '2014-01-05', tz='US/Central'), + pd.date_range('2014-01-06', '2014-01-07', tz='US/Central'), + pd.date_range('2014-01-01', '2014-01-07', tz='US/Central')), + + (pd.period_range('2014-01-01', '2014-01-05'), + pd.period_range('2014-01-06', '2014-01-07'), + pd.period_range('2014-01-01', '2014-01-07')), + ] + + for a, b, combined in data: + result = union_categoricals([Categorical(a), Categorical(b)]) + expected = Categorical(combined) + tm.assert_categorical_equal(result, expected, + check_category_order=True) + + # new categories ordered by appearance + s = Categorical(['x', 'y', 'z']) + s2 = Categorical(['a', 'b', 'c']) + result = union_categoricals([s, s2]) + expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'], + categories=['x', 'y', 'z', 'a', 'b', 'c']) + tm.assert_categorical_equal(result, expected) + + s = Categorical([0, 1.2, 2], ordered=True) + s2 = Categorical([0, 1.2, 2], ordered=True) + result = union_categoricals([s, s2]) + expected = Categorical([0, 1.2, 2, 0, 1.2, 2], ordered=True) + tm.assert_categorical_equal(result, expected) + + # must exactly match types + s = Categorical([0, 1.2, 2]) + s2 = Categorical([2, 3, 4]) + msg = 'dtype of categories must be the same' + with tm.assertRaisesRegexp(TypeError, msg): + union_categoricals([s, s2]) + + msg = 'No Categoricals to union' + with tm.assertRaisesRegexp(ValueError, msg): + union_categoricals([]) + + def test_union_categoricals_nan(self): + # GH 13759 + res = union_categoricals([pd.Categorical([1, 2, np.nan]), + pd.Categorical([3, 2, np.nan])]) + exp = Categorical([1, 2, np.nan, 3, 2, np.nan]) + tm.assert_categorical_equal(res, exp) + + res = union_categoricals([pd.Categorical(['A', 'B']), + pd.Categorical(['B', 'B', np.nan])]) + exp = Categorical(['A', 'B', 'B', 'B', np.nan]) + tm.assert_categorical_equal(res, exp) + + val1 = [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-03-01'), + pd.NaT] + val2 = [pd.NaT, pd.Timestamp('2011-01-01'), + pd.Timestamp('2011-02-01')] + + res = union_categoricals([pd.Categorical(val1), pd.Categorical(val2)]) + exp = Categorical(val1 + val2, + categories=[pd.Timestamp('2011-01-01'), + pd.Timestamp('2011-03-01'), + pd.Timestamp('2011-02-01')]) + tm.assert_categorical_equal(res, exp) + + # all NaN + res = union_categoricals([pd.Categorical([np.nan, np.nan]), + pd.Categorical(['X'])]) + exp = Categorical([np.nan, np.nan, 'X']) + tm.assert_categorical_equal(res, exp) + + res = union_categoricals([pd.Categorical([np.nan, np.nan]), + pd.Categorical([np.nan, np.nan])]) + exp = Categorical([np.nan, np.nan, np.nan, np.nan]) + tm.assert_categorical_equal(res, exp) + + def test_union_categoricals_empty(self): + # GH 13759 + res = union_categoricals([pd.Categorical([]), + pd.Categorical([])]) + exp = Categorical([]) + tm.assert_categorical_equal(res, exp) + + res = union_categoricals([pd.Categorical([]), + pd.Categorical([1.0])]) + 
exp = Categorical([1.0]) + tm.assert_categorical_equal(res, exp) + + # to make dtype equal + nanc = pd.Categorical(np.array([np.nan], dtype=np.float64)) + res = union_categoricals([nanc, + pd.Categorical([])]) + tm.assert_categorical_equal(res, nanc) + + def test_union_categorical_same_category(self): + # check fastpath + c1 = Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4]) + c2 = Categorical([3, 2, 1, np.nan], categories=[1, 2, 3, 4]) + res = union_categoricals([c1, c2]) + exp = Categorical([1, 2, 3, 4, 3, 2, 1, np.nan], + categories=[1, 2, 3, 4]) + tm.assert_categorical_equal(res, exp) + + c1 = Categorical(['z', 'z', 'z'], categories=['x', 'y', 'z']) + c2 = Categorical(['x', 'x', 'x'], categories=['x', 'y', 'z']) + res = union_categoricals([c1, c2]) + exp = Categorical(['z', 'z', 'z', 'x', 'x', 'x'], + categories=['x', 'y', 'z']) + tm.assert_categorical_equal(res, exp) + + def test_union_categoricals_ordered(self): + c1 = Categorical([1, 2, 3], ordered=True) + c2 = Categorical([1, 2, 3], ordered=False) + + msg = 'Categorical.ordered must be the same' + with tm.assertRaisesRegexp(TypeError, msg): + union_categoricals([c1, c2]) + + res = union_categoricals([c1, c1]) + exp = Categorical([1, 2, 3, 1, 2, 3], ordered=True) + tm.assert_categorical_equal(res, exp) + + c1 = Categorical([1, 2, 3, np.nan], ordered=True) + c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True) + + res = union_categoricals([c1, c2]) + exp = Categorical([1, 2, 3, np.nan, 3, 2], ordered=True) + tm.assert_categorical_equal(res, exp) + + c1 = Categorical([1, 2, 3], ordered=True) + c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True) + + msg = "to union ordered Categoricals, all categories must be the same" + with tm.assertRaisesRegexp(TypeError, msg): + union_categoricals([c1, c2]) + + def test_union_categoricals_sort(self): + # GH 13846 + c1 = Categorical(['x', 'y', 'z']) + c2 = Categorical(['a', 'b', 'c']) + result = union_categoricals([c1, c2], sort_categories=True) + expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'], + categories=['a', 'b', 'c', 'x', 'y', 'z']) + tm.assert_categorical_equal(result, expected) + + # fastpath + c1 = Categorical(['a', 'b'], categories=['b', 'a', 'c']) + c2 = Categorical(['b', 'c'], categories=['b', 'a', 'c']) + result = union_categoricals([c1, c2], sort_categories=True) + expected = Categorical(['a', 'b', 'b', 'c'], + categories=['a', 'b', 'c']) + tm.assert_categorical_equal(result, expected) + + c1 = Categorical(['a', 'b'], categories=['c', 'a', 'b']) + c2 = Categorical(['b', 'c'], categories=['c', 'a', 'b']) + result = union_categoricals([c1, c2], sort_categories=True) + expected = Categorical(['a', 'b', 'b', 'c'], + categories=['a', 'b', 'c']) + tm.assert_categorical_equal(result, expected) + + # fastpath - skip resort + c1 = Categorical(['a', 'b'], categories=['a', 'b', 'c']) + c2 = Categorical(['b', 'c'], categories=['a', 'b', 'c']) + result = union_categoricals([c1, c2], sort_categories=True) + expected = Categorical(['a', 'b', 'b', 'c'], + categories=['a', 'b', 'c']) + tm.assert_categorical_equal(result, expected) + + c1 = Categorical(['x', np.nan]) + c2 = Categorical([np.nan, 'b']) + result = union_categoricals([c1, c2], sort_categories=True) + expected = Categorical(['x', np.nan, np.nan, 'b'], + categories=['b', 'x']) + tm.assert_categorical_equal(result, expected) + + c1 = Categorical([np.nan]) + c2 = Categorical([np.nan]) + result = union_categoricals([c1, c2], sort_categories=True) + expected = Categorical([np.nan, np.nan], categories=[]) + 
tm.assert_categorical_equal(result, expected) + + c1 = Categorical([]) + c2 = Categorical([]) + result = union_categoricals([c1, c2], sort_categories=True) + expected = Categorical([]) + tm.assert_categorical_equal(result, expected) + + c1 = Categorical(['b', 'a'], categories=['b', 'a', 'c'], ordered=True) + c2 = Categorical(['a', 'c'], categories=['b', 'a', 'c'], ordered=True) + with tm.assertRaises(TypeError): + union_categoricals([c1, c2], sort_categories=True) + + def test_union_categoricals_sort_false(self): + # GH 13846 + c1 = Categorical(['x', 'y', 'z']) + c2 = Categorical(['a', 'b', 'c']) + result = union_categoricals([c1, c2], sort_categories=False) + expected = Categorical(['x', 'y', 'z', 'a', 'b', 'c'], + categories=['x', 'y', 'z', 'a', 'b', 'c']) + tm.assert_categorical_equal(result, expected) + + # fastpath + c1 = Categorical(['a', 'b'], categories=['b', 'a', 'c']) + c2 = Categorical(['b', 'c'], categories=['b', 'a', 'c']) + result = union_categoricals([c1, c2], sort_categories=False) + expected = Categorical(['a', 'b', 'b', 'c'], + categories=['b', 'a', 'c']) + tm.assert_categorical_equal(result, expected) + + # fastpath - skip resort + c1 = Categorical(['a', 'b'], categories=['a', 'b', 'c']) + c2 = Categorical(['b', 'c'], categories=['a', 'b', 'c']) + result = union_categoricals([c1, c2], sort_categories=False) + expected = Categorical(['a', 'b', 'b', 'c'], + categories=['a', 'b', 'c']) + tm.assert_categorical_equal(result, expected) + + c1 = Categorical(['x', np.nan]) + c2 = Categorical([np.nan, 'b']) + result = union_categoricals([c1, c2], sort_categories=False) + expected = Categorical(['x', np.nan, np.nan, 'b'], + categories=['x', 'b']) + tm.assert_categorical_equal(result, expected) + + c1 = Categorical([np.nan]) + c2 = Categorical([np.nan]) + result = union_categoricals([c1, c2], sort_categories=False) + expected = Categorical([np.nan, np.nan], categories=[]) + tm.assert_categorical_equal(result, expected) + + c1 = Categorical([]) + c2 = Categorical([]) + result = union_categoricals([c1, c2], sort_categories=False) + expected = Categorical([]) + tm.assert_categorical_equal(result, expected) + + c1 = Categorical(['b', 'a'], categories=['b', 'a', 'c'], ordered=True) + c2 = Categorical(['a', 'c'], categories=['b', 'a', 'c'], ordered=True) + result = union_categoricals([c1, c2], sort_categories=False) + expected = Categorical(['b', 'a', 'a', 'c'], + categories=['b', 'a', 'c'], ordered=True) + tm.assert_categorical_equal(result, expected) + def test_concat_bug_1719(self): ts1 = tm.makeTimeSeries() ts2 = tm.makeTimeSeries()[::2] @@ -1031,6 +1883,253 @@ def test_concat_invalid_first_argument(self): expected = read_csv(StringIO(data)) assert_frame_equal(result, expected) + def test_concat_NaT_series(self): + # GH 11693 + # test for merging NaT series with datetime series. 
+ x = Series(date_range('20151124 08:00', '20151124 09:00', + freq='1h', tz='US/Eastern')) + y = Series(pd.NaT, index=[0, 1], dtype='datetime64[ns, US/Eastern]') + expected = Series([x[0], x[1], pd.NaT, pd.NaT]) + + result = concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + + # all NaT with tz + expected = Series(pd.NaT, index=range(4), + dtype='datetime64[ns, US/Eastern]') + result = pd.concat([y, y], ignore_index=True) + tm.assert_series_equal(result, expected) + + # without tz + x = pd.Series(pd.date_range('20151124 08:00', + '20151124 09:00', freq='1h')) + y = pd.Series(pd.date_range('20151124 10:00', + '20151124 11:00', freq='1h')) + y[:] = pd.NaT + expected = pd.Series([x[0], x[1], pd.NaT, pd.NaT]) + result = pd.concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + + # all NaT without tz + x[:] = pd.NaT + expected = pd.Series(pd.NaT, index=range(4), + dtype='datetime64[ns]') + result = pd.concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + + def test_concat_tz_frame(self): + df2 = DataFrame(dict(A=pd.Timestamp('20130102', tz='US/Eastern'), + B=pd.Timestamp('20130603', tz='CET')), + index=range(5)) + + # concat + df3 = pd.concat([df2.A.to_frame(), df2.B.to_frame()], axis=1) + assert_frame_equal(df2, df3) + + def test_concat_tz_series(self): + # GH 11755 + # tz and no tz + x = Series(date_range('20151124 08:00', + '20151124 09:00', + freq='1h', tz='UTC')) + y = Series(date_range('2012-01-01', '2012-01-02')) + expected = Series([x[0], x[1], y[0], y[1]], + dtype='object') + result = concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + + # GH 11887 + # concat tz and object + x = Series(date_range('20151124 08:00', + '20151124 09:00', + freq='1h', tz='UTC')) + y = Series(['a', 'b']) + expected = Series([x[0], x[1], y[0], y[1]], + dtype='object') + result = concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + + # 12217 + # 12306 fixed I think + + # Concat'ing two UTC times + first = pd.DataFrame([[datetime(2016, 1, 1)]]) + first[0] = first[0].dt.tz_localize('UTC') + + second = pd.DataFrame([[datetime(2016, 1, 2)]]) + second[0] = second[0].dt.tz_localize('UTC') + + result = pd.concat([first, second]) + self.assertEqual(result[0].dtype, 'datetime64[ns, UTC]') + + # Concat'ing two London times + first = pd.DataFrame([[datetime(2016, 1, 1)]]) + first[0] = first[0].dt.tz_localize('Europe/London') + + second = pd.DataFrame([[datetime(2016, 1, 2)]]) + second[0] = second[0].dt.tz_localize('Europe/London') + + result = pd.concat([first, second]) + self.assertEqual(result[0].dtype, 'datetime64[ns, Europe/London]') + + # Concat'ing 2+1 London times + first = pd.DataFrame([[datetime(2016, 1, 1)], [datetime(2016, 1, 2)]]) + first[0] = first[0].dt.tz_localize('Europe/London') + + second = pd.DataFrame([[datetime(2016, 1, 3)]]) + second[0] = second[0].dt.tz_localize('Europe/London') + + result = pd.concat([first, second]) + self.assertEqual(result[0].dtype, 'datetime64[ns, Europe/London]') + + # Concat'ing 1+2 London times + first = pd.DataFrame([[datetime(2016, 1, 1)]]) + first[0] = first[0].dt.tz_localize('Europe/London') + + second = pd.DataFrame([[datetime(2016, 1, 2)], [datetime(2016, 1, 3)]]) + second[0] = second[0].dt.tz_localize('Europe/London') + + result = pd.concat([first, second]) + self.assertEqual(result[0].dtype, 'datetime64[ns, Europe/London]') + + def test_concat_tz_series_with_datetimelike(self): + # GH 12620 + # tz and timedelta + x = 
[pd.Timestamp('2011-01-01', tz='US/Eastern'), + pd.Timestamp('2011-02-01', tz='US/Eastern')] + y = [pd.Timedelta('1 day'), pd.Timedelta('2 day')] + result = concat([pd.Series(x), pd.Series(y)], ignore_index=True) + tm.assert_series_equal(result, pd.Series(x + y, dtype='object')) + + # tz and period + y = [pd.Period('2011-03', freq='M'), pd.Period('2011-04', freq='M')] + result = concat([pd.Series(x), pd.Series(y)], ignore_index=True) + tm.assert_series_equal(result, pd.Series(x + y, dtype='object')) + + def test_concat_tz_series_tzlocal(self): + # GH 13583 + tm._skip_if_no_dateutil() + import dateutil + x = [pd.Timestamp('2011-01-01', tz=dateutil.tz.tzlocal()), + pd.Timestamp('2011-02-01', tz=dateutil.tz.tzlocal())] + y = [pd.Timestamp('2012-01-01', tz=dateutil.tz.tzlocal()), + pd.Timestamp('2012-02-01', tz=dateutil.tz.tzlocal())] + result = concat([pd.Series(x), pd.Series(y)], ignore_index=True) + tm.assert_series_equal(result, pd.Series(x + y)) + self.assertEqual(result.dtype, 'datetime64[ns, tzlocal()]') + + def test_concat_period_series(self): + x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D')) + y = Series(pd.PeriodIndex(['2015-10-01', '2016-01-01'], freq='D')) + expected = Series([x[0], x[1], y[0], y[1]], dtype='object') + result = concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + self.assertEqual(result.dtype, 'object') + + # different freq + x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D')) + y = Series(pd.PeriodIndex(['2015-10-01', '2016-01-01'], freq='M')) + expected = Series([x[0], x[1], y[0], y[1]], dtype='object') + result = concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + self.assertEqual(result.dtype, 'object') + + x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D')) + y = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='M')) + expected = Series([x[0], x[1], y[0], y[1]], dtype='object') + result = concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + self.assertEqual(result.dtype, 'object') + + # non-period + x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D')) + y = Series(pd.DatetimeIndex(['2015-11-01', '2015-12-01'])) + expected = Series([x[0], x[1], y[0], y[1]], dtype='object') + result = concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + self.assertEqual(result.dtype, 'object') + + x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D')) + y = Series(['A', 'B']) + expected = Series([x[0], x[1], y[0], y[1]], dtype='object') + result = concat([x, y], ignore_index=True) + tm.assert_series_equal(result, expected) + self.assertEqual(result.dtype, 'object') + + def test_concat_empty_series(self): + # GH 11082 + s1 = pd.Series([1, 2, 3], name='x') + s2 = pd.Series(name='y') + res = pd.concat([s1, s2], axis=1) + exp = pd.DataFrame({'x': [1, 2, 3], 'y': [np.nan, np.nan, np.nan]}) + tm.assert_frame_equal(res, exp) + + s1 = pd.Series([1, 2, 3], name='x') + s2 = pd.Series(name='y') + res = pd.concat([s1, s2], axis=0) + # name will be reset + exp = pd.Series([1, 2, 3]) + tm.assert_series_equal(res, exp) + + # empty Series with no name + s1 = pd.Series([1, 2, 3], name='x') + s2 = pd.Series(name=None) + res = pd.concat([s1, s2], axis=1) + exp = pd.DataFrame({'x': [1, 2, 3], 0: [np.nan, np.nan, np.nan]}, + columns=['x', 0]) + tm.assert_frame_equal(res, exp) + + def test_default_index(self): + # is_series and ignore_index + s1 = pd.Series([1, 2, 3], name='x') + s2 = pd.Series([4, 5, 6], name='y') 
+ res = pd.concat([s1, s2], axis=1, ignore_index=True) + self.assertIsInstance(res.columns, pd.RangeIndex) + exp = pd.DataFrame([[1, 4], [2, 5], [3, 6]]) + # use check_index_type=True to check the result have + # RangeIndex (default index) + tm.assert_frame_equal(res, exp, check_index_type=True, + check_column_type=True) + + # is_series and all inputs have no names + s1 = pd.Series([1, 2, 3]) + s2 = pd.Series([4, 5, 6]) + res = pd.concat([s1, s2], axis=1, ignore_index=False) + self.assertIsInstance(res.columns, pd.RangeIndex) + exp = pd.DataFrame([[1, 4], [2, 5], [3, 6]]) + exp.columns = pd.RangeIndex(2) + tm.assert_frame_equal(res, exp, check_index_type=True, + check_column_type=True) + + # is_dataframe and ignore_index + df1 = pd.DataFrame({'A': [1, 2], 'B': [5, 6]}) + df2 = pd.DataFrame({'A': [3, 4], 'B': [7, 8]}) + + res = pd.concat([df1, df2], axis=0, ignore_index=True) + exp = pd.DataFrame([[1, 5], [2, 6], [3, 7], [4, 8]], + columns=['A', 'B']) + tm.assert_frame_equal(res, exp, check_index_type=True, + check_column_type=True) + + res = pd.concat([df1, df2], axis=1, ignore_index=True) + exp = pd.DataFrame([[1, 5, 3, 7], [2, 6, 4, 8]]) + tm.assert_frame_equal(res, exp, check_index_type=True, + check_column_type=True) + + def test_concat_multiindex_rangeindex(self): + # GH13542 + # when multi-index levels are RangeIndex objects + # there is a bug in concat with objects of len 1 + + df = DataFrame(np.random.randn(9, 2)) + df.index = MultiIndex(levels=[pd.RangeIndex(3), pd.RangeIndex(3)], + labels=[np.repeat(np.arange(3), 3), + np.tile(np.arange(3), 3)]) + + res = concat([df.iloc[[2, 3, 4], :], df.iloc[[5], :]]) + exp = df.iloc[[2, 3, 4, 5], :] + tm.assert_frame_equal(res, exp) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tools/tests/test_join.py b/pandas/tools/tests/test_join.py new file mode 100644 index 0000000000000..f33d5f16cd439 --- /dev/null +++ b/pandas/tools/tests/test_join.py @@ -0,0 +1,804 @@ +# pylint: disable=E1103 + +import nose + +from numpy.random import randn +import numpy as np + +import pandas as pd +from pandas.compat import lrange +import pandas.compat as compat +from pandas.tools.merge import merge, concat +from pandas.util.testing import assert_frame_equal +from pandas import DataFrame, MultiIndex, Series + +import pandas._join as _join +import pandas.util.testing as tm +from pandas.tools.tests.test_merge import get_test_data, N, NGROUPS + + +a_ = np.array + + +class TestJoin(tm.TestCase): + + _multiprocess_can_split_ = True + + def setUp(self): + # aggregate multiple columns + self.df = DataFrame({'key1': get_test_data(), + 'key2': get_test_data(), + 'data1': np.random.randn(N), + 'data2': np.random.randn(N)}) + + # exclude a couple keys for fun + self.df = self.df[self.df['key2'] > 1] + + self.df2 = DataFrame({'key1': get_test_data(n=N // 5), + 'key2': get_test_data(ngroups=NGROUPS // 2, + n=N // 5), + 'value': np.random.randn(N // 5)}) + + index, data = tm.getMixedTypeDict() + self.target = DataFrame(data, index=index) + + # Join on string value + self.source = DataFrame({'MergedA': data['A'], 'MergedD': data['D']}, + index=data['C']) + + def test_cython_left_outer_join(self): + left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) + right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64) + max_group = 5 + + ls, rs = _join.left_outer_join(left, right, max_group) + + exp_ls = left.argsort(kind='mergesort') + exp_rs = right.argsort(kind='mergesort') + + exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 
4, 5, 5, 5, + 6, 6, 7, 7, 8, 8, 9, 10]) + exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, + 4, 5, 4, 5, 4, 5, -1, -1]) + + exp_ls = exp_ls.take(exp_li) + exp_ls[exp_li == -1] = -1 + + exp_rs = exp_rs.take(exp_ri) + exp_rs[exp_ri == -1] = -1 + + self.assert_numpy_array_equal(ls, exp_ls, check_dtype=False) + self.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) + + def test_cython_right_outer_join(self): + left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) + right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64) + max_group = 5 + + rs, ls = _join.left_outer_join(right, left, max_group) + + exp_ls = left.argsort(kind='mergesort') + exp_rs = right.argsort(kind='mergesort') + + # 0 1 1 1 + exp_li = a_([0, 1, 2, 3, 4, 5, 3, 4, 5, 3, 4, 5, + # 2 2 4 + 6, 7, 8, 6, 7, 8, -1]) + exp_ri = a_([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, + 4, 4, 4, 5, 5, 5, 6]) + + exp_ls = exp_ls.take(exp_li) + exp_ls[exp_li == -1] = -1 + + exp_rs = exp_rs.take(exp_ri) + exp_rs[exp_ri == -1] = -1 + + self.assert_numpy_array_equal(ls, exp_ls, check_dtype=False) + self.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) + + def test_cython_inner_join(self): + left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) + right = a_([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64) + max_group = 5 + + ls, rs = _join.inner_join(left, right, max_group) + + exp_ls = left.argsort(kind='mergesort') + exp_rs = right.argsort(kind='mergesort') + + exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, + 6, 6, 7, 7, 8, 8]) + exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, + 4, 5, 4, 5, 4, 5]) + + exp_ls = exp_ls.take(exp_li) + exp_ls[exp_li == -1] = -1 + + exp_rs = exp_rs.take(exp_ri) + exp_rs[exp_ri == -1] = -1 + + self.assert_numpy_array_equal(ls, exp_ls, check_dtype=False) + self.assert_numpy_array_equal(rs, exp_rs, check_dtype=False) + + def test_left_outer_join(self): + joined_key2 = merge(self.df, self.df2, on='key2') + _check_join(self.df, self.df2, joined_key2, ['key2'], how='left') + + joined_both = merge(self.df, self.df2) + _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], + how='left') + + def test_right_outer_join(self): + joined_key2 = merge(self.df, self.df2, on='key2', how='right') + _check_join(self.df, self.df2, joined_key2, ['key2'], how='right') + + joined_both = merge(self.df, self.df2, how='right') + _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], + how='right') + + def test_full_outer_join(self): + joined_key2 = merge(self.df, self.df2, on='key2', how='outer') + _check_join(self.df, self.df2, joined_key2, ['key2'], how='outer') + + joined_both = merge(self.df, self.df2, how='outer') + _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], + how='outer') + + def test_inner_join(self): + joined_key2 = merge(self.df, self.df2, on='key2', how='inner') + _check_join(self.df, self.df2, joined_key2, ['key2'], how='inner') + + joined_both = merge(self.df, self.df2, how='inner') + _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], + how='inner') + + def test_handle_overlap(self): + joined = merge(self.df, self.df2, on='key2', + suffixes=['.foo', '.bar']) + + self.assertIn('key1.foo', joined) + self.assertIn('key1.bar', joined) + + def test_handle_overlap_arbitrary_key(self): + joined = merge(self.df, self.df2, + left_on='key2', right_on='key1', + suffixes=['.foo', '.bar']) + self.assertIn('key1.foo', joined) + self.assertIn('key2.bar', joined) + + def test_join_on(self): + target = self.target + source = self.source + + merged = target.join(source, on='C') + 
self.assert_series_equal(merged['MergedA'], target['A'], + check_names=False) + self.assert_series_equal(merged['MergedD'], target['D'], + check_names=False) + + # join with duplicates (fix regression from DataFrame/Matrix merge) + df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']}) + df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c']) + joined = df.join(df2, on='key') + expected = DataFrame({'key': ['a', 'a', 'b', 'b', 'c'], + 'value': [0, 0, 1, 1, 2]}) + assert_frame_equal(joined, expected) + + # Test when some are missing + df_a = DataFrame([[1], [2], [3]], index=['a', 'b', 'c'], + columns=['one']) + df_b = DataFrame([['foo'], ['bar']], index=[1, 2], + columns=['two']) + df_c = DataFrame([[1], [2]], index=[1, 2], + columns=['three']) + joined = df_a.join(df_b, on='one') + joined = joined.join(df_c, on='one') + self.assertTrue(np.isnan(joined['two']['c'])) + self.assertTrue(np.isnan(joined['three']['c'])) + + # merge column not present + self.assertRaises(KeyError, target.join, source, on='E') + + # overlap + source_copy = source.copy() + source_copy['A'] = 0 + self.assertRaises(ValueError, target.join, source_copy, on='A') + + def test_join_on_fails_with_different_right_index(self): + with tm.assertRaises(ValueError): + df = DataFrame({'a': np.random.choice(['m', 'f'], size=3), + 'b': np.random.randn(3)}) + df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10), + 'b': np.random.randn(10)}, + index=tm.makeCustomIndex(10, 2)) + merge(df, df2, left_on='a', right_index=True) + + def test_join_on_fails_with_different_left_index(self): + with tm.assertRaises(ValueError): + df = DataFrame({'a': np.random.choice(['m', 'f'], size=3), + 'b': np.random.randn(3)}, + index=tm.makeCustomIndex(10, 2)) + df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10), + 'b': np.random.randn(10)}) + merge(df, df2, right_on='b', left_index=True) + + def test_join_on_fails_with_different_column_counts(self): + with tm.assertRaises(ValueError): + df = DataFrame({'a': np.random.choice(['m', 'f'], size=3), + 'b': np.random.randn(3)}) + df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10), + 'b': np.random.randn(10)}, + index=tm.makeCustomIndex(10, 2)) + merge(df, df2, right_on='a', left_on=['a', 'b']) + + def test_join_on_fails_with_wrong_object_type(self): + # GH12081 + wrongly_typed = [Series([0, 1]), 2, 'str', None, np.array([0, 1])] + df = DataFrame({'a': [1, 1]}) + + for obj in wrongly_typed: + with tm.assertRaisesRegexp(ValueError, str(type(obj))): + merge(obj, df, left_on='a', right_on='a') + with tm.assertRaisesRegexp(ValueError, str(type(obj))): + merge(df, obj, left_on='a', right_on='a') + + def test_join_on_pass_vector(self): + expected = self.target.join(self.source, on='C') + del expected['C'] + + join_col = self.target.pop('C') + result = self.target.join(self.source, on=join_col) + assert_frame_equal(result, expected) + + def test_join_with_len0(self): + # nothing to merge + merged = self.target.join(self.source.reindex([]), on='C') + for col in self.source: + self.assertIn(col, merged) + self.assertTrue(merged[col].isnull().all()) + + merged2 = self.target.join(self.source.reindex([]), on='C', + how='inner') + self.assert_index_equal(merged2.columns, merged.columns) + self.assertEqual(len(merged2), 0) + + def test_join_on_inner(self): + df = DataFrame({'key': ['a', 'a', 'd', 'b', 'b', 'c']}) + df2 = DataFrame({'value': [0, 1]}, index=['a', 'b']) + + joined = df.join(df2, on='key', how='inner') + + expected = df.join(df2, on='key') + expected =
expected[expected['value'].notnull()] + self.assert_series_equal(joined['key'], expected['key'], + check_dtype=False) + self.assert_series_equal(joined['value'], expected['value'], + check_dtype=False) + self.assert_index_equal(joined.index, expected.index) + + def test_join_on_singlekey_list(self): + df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']}) + df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c']) + + # corner cases + joined = df.join(df2, on=['key']) + expected = df.join(df2, on='key') + + assert_frame_equal(joined, expected) + + def test_join_on_series(self): + result = self.target.join(self.source['MergedA'], on='C') + expected = self.target.join(self.source[['MergedA']], on='C') + assert_frame_equal(result, expected) + + def test_join_on_series_buglet(self): + # GH #638 + df = DataFrame({'a': [1, 1]}) + ds = Series([2], index=[1], name='b') + result = df.join(ds, on='a') + expected = DataFrame({'a': [1, 1], + 'b': [2, 2]}, index=df.index) + tm.assert_frame_equal(result, expected) + + def test_join_index_mixed(self): + df1 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True}, + index=np.arange(10), + columns=['A', 'B', 'C', 'D']) + self.assertEqual(df1['B'].dtype, np.int64) + self.assertEqual(df1['D'].dtype, np.bool_) + + df2 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True}, + index=np.arange(0, 10, 2), + columns=['A', 'B', 'C', 'D']) + + # overlap + joined = df1.join(df2, lsuffix='_one', rsuffix='_two') + expected_columns = ['A_one', 'B_one', 'C_one', 'D_one', + 'A_two', 'B_two', 'C_two', 'D_two'] + df1.columns = expected_columns[:4] + df2.columns = expected_columns[4:] + expected = _join_by_hand(df1, df2) + assert_frame_equal(joined, expected) + + # no overlapping blocks + df1 = DataFrame(index=np.arange(10)) + df1['bool'] = True + df1['string'] = 'foo' + + df2 = DataFrame(index=np.arange(5, 15)) + df2['int'] = 1 + df2['float'] = 1. + + for kind in ['inner', 'outer', 'left', 'right']: + + joined = df1.join(df2, how=kind) + expected = _join_by_hand(df1, df2, how=kind) + assert_frame_equal(joined, expected) + + joined = df2.join(df1, how=kind) + expected = _join_by_hand(df2, df1, how=kind) + assert_frame_equal(joined, expected) + + def test_join_empty_bug(self): + # generated an exception in 0.4.3 + x = DataFrame() + x.join(DataFrame([3], index=[0], columns=['A']), how='outer') + + def test_join_unconsolidated(self): + # GH #331 + a = DataFrame(randn(30, 2), columns=['a', 'b']) + c = Series(randn(30)) + a['c'] = c + d = DataFrame(randn(30, 1), columns=['q']) + + # it works! 
+ a.join(d) + d.join(a) + + def test_join_multiindex(self): + index1 = MultiIndex.from_arrays([['a', 'a', 'a', 'b', 'b', 'b'], + [1, 2, 3, 1, 2, 3]], + names=['first', 'second']) + + index2 = MultiIndex.from_arrays([['b', 'b', 'b', 'c', 'c', 'c'], + [1, 2, 3, 1, 2, 3]], + names=['first', 'second']) + + df1 = DataFrame(data=np.random.randn(6), index=index1, + columns=['var X']) + df2 = DataFrame(data=np.random.randn(6), index=index2, + columns=['var Y']) + + df1 = df1.sortlevel(0) + df2 = df2.sortlevel(0) + + joined = df1.join(df2, how='outer') + ex_index = index1._tuple_index.union(index2._tuple_index) + expected = df1.reindex(ex_index).join(df2.reindex(ex_index)) + expected.index.names = index1.names + assert_frame_equal(joined, expected) + self.assertEqual(joined.index.names, index1.names) + + df1 = df1.sortlevel(1) + df2 = df2.sortlevel(1) + + joined = df1.join(df2, how='outer').sortlevel(0) + ex_index = index1._tuple_index.union(index2._tuple_index) + expected = df1.reindex(ex_index).join(df2.reindex(ex_index)) + expected.index.names = index1.names + + assert_frame_equal(joined, expected) + self.assertEqual(joined.index.names, index1.names) + + def test_join_inner_multiindex(self): + key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux', + 'qux', 'snap'] + key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two', + 'three', 'one'] + + data = np.random.randn(len(key1)) + data = DataFrame({'key1': key1, 'key2': key2, + 'data': data}) + + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + to_join = DataFrame(np.random.randn(10, 3), index=index, + columns=['j_one', 'j_two', 'j_three']) + + joined = data.join(to_join, on=['key1', 'key2'], how='inner') + expected = merge(data, to_join.reset_index(), + left_on=['key1', 'key2'], + right_on=['first', 'second'], how='inner', + sort=False) + + expected2 = merge(to_join, data, + right_on=['key1', 'key2'], left_index=True, + how='inner', sort=False) + assert_frame_equal(joined, expected2.reindex_like(joined)) + + expected2 = merge(to_join, data, right_on=['key1', 'key2'], + left_index=True, how='inner', sort=False) + + expected = expected.drop(['first', 'second'], axis=1) + expected.index = joined.index + + self.assertTrue(joined.index.is_monotonic) + assert_frame_equal(joined, expected) + + # _assert_same_contents(expected, expected2.ix[:, expected.columns]) + + def test_join_hierarchical_mixed(self): + # GH 2024 + df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=['a', 'b', 'c']) + new_df = df.groupby(['a']).agg({'b': [np.mean, np.sum]}) + other_df = DataFrame( + [(1, 2, 3), (7, 10, 6)], columns=['a', 'b', 'd']) + other_df.set_index('a', inplace=True) + # GH 9455, 12219 + with tm.assert_produces_warning(UserWarning): + result = merge(new_df, other_df, left_index=True, right_index=True) + self.assertTrue(('b', 'mean') in result) + self.assertTrue('b' in result) + + def test_join_float64_float32(self): + + a = DataFrame(randn(10, 2), columns=['a', 'b'], dtype=np.float64) + b = DataFrame(randn(10, 1), columns=['c'], dtype=np.float32) + joined = a.join(b) + self.assertEqual(joined.dtypes['a'], 'float64') + self.assertEqual(joined.dtypes['b'], 'float64') + self.assertEqual(joined.dtypes['c'], 'float32') + + a = np.random.randint(0, 5, 100).astype('int64') + b = np.random.random(100).astype('float64') + c = np.random.random(100).astype('float32') + df = DataFrame({'a': a, 'b': b, 'c': c}) + xpdf = 
DataFrame({'a': a, 'b': b, 'c': c}) + s = DataFrame(np.random.random(5).astype('float32'), columns=['md']) + rs = df.merge(s, left_on='a', right_index=True) + self.assertEqual(rs.dtypes['a'], 'int64') + self.assertEqual(rs.dtypes['b'], 'float64') + self.assertEqual(rs.dtypes['c'], 'float32') + self.assertEqual(rs.dtypes['md'], 'float32') + + xp = xpdf.merge(s, left_on='a', right_index=True) + assert_frame_equal(rs, xp) + + def test_join_many_non_unique_index(self): + df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]}) + df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]}) + df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]}) + idf1 = df1.set_index(["a", "b"]) + idf2 = df2.set_index(["a", "b"]) + idf3 = df3.set_index(["a", "b"]) + + result = idf1.join([idf2, idf3], how='outer') + + df_partially_merged = merge(df1, df2, on=['a', 'b'], how='outer') + expected = merge(df_partially_merged, df3, on=['a', 'b'], how='outer') + + result = result.reset_index() + expected = expected[result.columns] + expected['a'] = expected.a.astype('int64') + expected['b'] = expected.b.astype('int64') + assert_frame_equal(result, expected) + + df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]}) + df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]}) + df3 = DataFrame( + {"a": [1, 1, 1], "b": [1, 1, 2], "e": [1000, 2000, 3000]}) + idf1 = df1.set_index(["a", "b"]) + idf2 = df2.set_index(["a", "b"]) + idf3 = df3.set_index(["a", "b"]) + result = idf1.join([idf2, idf3], how='inner') + + df_partially_merged = merge(df1, df2, on=['a', 'b'], how='inner') + expected = merge(df_partially_merged, df3, on=['a', 'b'], how='inner') + + result = result.reset_index() + + assert_frame_equal(result, expected.ix[:, result.columns]) + + # GH 11519 + df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'three', + 'two', 'two', 'one', 'three'], + 'C': np.random.randn(8), + 'D': np.random.randn(8)}) + s = Series(np.repeat(np.arange(8), 2), + index=np.repeat(np.arange(8), 2), name='TEST') + inner = df.join(s, how='inner') + outer = df.join(s, how='outer') + left = df.join(s, how='left') + right = df.join(s, how='right') + assert_frame_equal(inner, outer) + assert_frame_equal(inner, left) + assert_frame_equal(inner, right) + + def test_join_sort(self): + left = DataFrame({'key': ['foo', 'bar', 'baz', 'foo'], + 'value': [1, 2, 3, 4]}) + right = DataFrame({'value2': ['a', 'b', 'c']}, + index=['bar', 'baz', 'foo']) + + joined = left.join(right, on='key', sort=True) + expected = DataFrame({'key': ['bar', 'baz', 'foo', 'foo'], + 'value': [2, 3, 1, 4], + 'value2': ['a', 'b', 'c', 'c']}, + index=[1, 2, 0, 3]) + assert_frame_equal(joined, expected) + + # smoke test + joined = left.join(right, on='key', sort=False) + self.assert_index_equal(joined.index, pd.Index(lrange(4))) + + def test_join_mixed_non_unique_index(self): + # GH 12814, unorderable types in py3 with a non-unique index + df1 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 3, 'a']) + df2 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 3, 3, 4]) + result = df1.join(df2) + expected = DataFrame({'a': [1, 2, 3, 3, 4], + 'b': [5, np.nan, 6, 7, np.nan]}, + index=[1, 2, 3, 3, 'a']) + tm.assert_frame_equal(result, expected) + + df3 = DataFrame({'a': [1, 2, 3, 4]}, index=[1, 2, 2, 'a']) + df4 = DataFrame({'b': [5, 6, 7, 8]}, index=[1, 2, 3, 4]) + result = df3.join(df4) + expected = DataFrame({'a': [1, 2, 3, 4], 'b': [5, 6, 6, np.nan]}, + index=[1, 2, 2, 'a']) + 
tm.assert_frame_equal(result, expected) + + def test_mixed_type_join_with_suffix(self): + # GH #916 + df = DataFrame(np.random.randn(20, 6), + columns=['a', 'b', 'c', 'd', 'e', 'f']) + df.insert(0, 'id', 0) + df.insert(5, 'dt', 'foo') + + grouped = df.groupby('id') + mn = grouped.mean() + cn = grouped.count() + + # it works! + mn.join(cn, rsuffix='_right') + + def test_join_many(self): + df = DataFrame(np.random.randn(10, 6), columns=list('abcdef')) + df_list = [df[['a', 'b']], df[['c', 'd']], df[['e', 'f']]] + + joined = df_list[0].join(df_list[1:]) + tm.assert_frame_equal(joined, df) + + df_list = [df[['a', 'b']][:-2], + df[['c', 'd']][2:], df[['e', 'f']][1:9]] + + def _check_diff_index(df_list, result, exp_index): + reindexed = [x.reindex(exp_index) for x in df_list] + expected = reindexed[0].join(reindexed[1:]) + tm.assert_frame_equal(result, expected) + + # different join types + joined = df_list[0].join(df_list[1:], how='outer') + _check_diff_index(df_list, joined, df.index) + + joined = df_list[0].join(df_list[1:]) + _check_diff_index(df_list, joined, df_list[0].index) + + joined = df_list[0].join(df_list[1:], how='inner') + _check_diff_index(df_list, joined, df.index[2:8]) + + self.assertRaises(ValueError, df_list[0].join, df_list[1:], on='a') + + def test_join_many_mixed(self): + df = DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D']) + df['key'] = ['foo', 'bar'] * 4 + df1 = df.ix[:, ['A', 'B']] + df2 = df.ix[:, ['C', 'D']] + df3 = df.ix[:, ['key']] + + result = df1.join([df2, df3]) + assert_frame_equal(result, df) + + def test_join_dups(self): + + # joining dups + df = concat([DataFrame(np.random.randn(10, 4), + columns=['A', 'A', 'B', 'B']), + DataFrame(np.random.randint(0, 10, size=20) + .reshape(10, 2), + columns=['A', 'C'])], + axis=1) + + expected = concat([df, df], axis=1) + result = df.join(df, rsuffix='_2') + result.columns = expected.columns + assert_frame_equal(result, expected) + + # GH 4975, invalid join on dups + w = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) + x = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) + y = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) + z = DataFrame(np.random.randn(4, 2), columns=["x", "y"]) + + dta = x.merge(y, left_index=True, right_index=True).merge( + z, left_index=True, right_index=True, how="outer") + dta = dta.merge(w, left_index=True, right_index=True) + expected = concat([x, y, z, w], axis=1) + expected.columns = ['x_x', 'y_x', 'x_y', + 'y_y', 'x_x', 'y_x', 'x_y', 'y_y'] + assert_frame_equal(dta, expected) + + def test_panel_join(self): + panel = tm.makePanel() + tm.add_nans(panel) + + p1 = panel.ix[:2, :10, :3] + p2 = panel.ix[2:, 5:, 2:] + + # left join + result = p1.join(p2) + expected = p1.copy() + expected['ItemC'] = p2['ItemC'] + tm.assert_panel_equal(result, expected) + + # right join + result = p1.join(p2, how='right') + expected = p2.copy() + expected['ItemA'] = p1['ItemA'] + expected['ItemB'] = p1['ItemB'] + expected = expected.reindex(items=['ItemA', 'ItemB', 'ItemC']) + tm.assert_panel_equal(result, expected) + + # inner join + result = p1.join(p2, how='inner') + expected = panel.ix[:, 5:10, 2:3] + tm.assert_panel_equal(result, expected) + + # outer join + result = p1.join(p2, how='outer') + expected = p1.reindex(major=panel.major_axis, + minor=panel.minor_axis) + expected = expected.join(p2.reindex(major=panel.major_axis, + minor=panel.minor_axis)) + tm.assert_panel_equal(result, expected) + + def test_panel_join_overlap(self): + panel = tm.makePanel() + tm.add_nans(panel) + + p1 = 
panel.ix[['ItemA', 'ItemB', 'ItemC']] + p2 = panel.ix[['ItemB', 'ItemC']] + + # Expected index is + # + # ItemA, ItemB_p1, ItemC_p1, ItemB_p2, ItemC_p2 + joined = p1.join(p2, lsuffix='_p1', rsuffix='_p2') + p1_suf = p1.ix[['ItemB', 'ItemC']].add_suffix('_p1') + p2_suf = p2.ix[['ItemB', 'ItemC']].add_suffix('_p2') + no_overlap = panel.ix[['ItemA']] + expected = no_overlap.join(p1_suf.join(p2_suf)) + tm.assert_panel_equal(joined, expected) + + def test_panel_join_many(self): + tm.K = 10 + panel = tm.makePanel() + tm.K = 4 + + panels = [panel.ix[:2], panel.ix[2:6], panel.ix[6:]] + + joined = panels[0].join(panels[1:]) + tm.assert_panel_equal(joined, panel) + + panels = [panel.ix[:2, :-5], panel.ix[2:6, 2:], panel.ix[6:, 5:-7]] + + data_dict = {} + for p in panels: + data_dict.update(p.iteritems()) + + joined = panels[0].join(panels[1:], how='inner') + expected = pd.Panel.from_dict(data_dict, intersect=True) + tm.assert_panel_equal(joined, expected) + + joined = panels[0].join(panels[1:], how='outer') + expected = pd.Panel.from_dict(data_dict, intersect=False) + tm.assert_panel_equal(joined, expected) + + # edge cases + self.assertRaises(ValueError, panels[0].join, panels[1:], + how='outer', lsuffix='foo', rsuffix='bar') + self.assertRaises(ValueError, panels[0].join, panels[1:], + how='right') + + +def _check_join(left, right, result, join_col, how='left', + lsuffix='_x', rsuffix='_y'): + + # some smoke tests + for c in join_col: + assert(result[c].notnull().all()) + + left_grouped = left.groupby(join_col) + right_grouped = right.groupby(join_col) + + for group_key, group in result.groupby(join_col): + l_joined = _restrict_to_columns(group, left.columns, lsuffix) + r_joined = _restrict_to_columns(group, right.columns, rsuffix) + + try: + lgroup = left_grouped.get_group(group_key) + except KeyError: + if how in ('left', 'inner'): + raise AssertionError('key %s should not have been in the join' + % str(group_key)) + + _assert_all_na(l_joined, left.columns, join_col) + else: + _assert_same_contents(l_joined, lgroup) + + try: + rgroup = right_grouped.get_group(group_key) + except KeyError: + if how in ('right', 'inner'): + raise AssertionError('key %s should not have been in the join' + % str(group_key)) + + _assert_all_na(r_joined, right.columns, join_col) + else: + _assert_same_contents(r_joined, rgroup) + + +def _restrict_to_columns(group, columns, suffix): + found = [c for c in group.columns + if c in columns or c.replace(suffix, '') in columns] + + # filter + group = group.ix[:, found] + + # get rid of suffixes, if any + group = group.rename(columns=lambda x: x.replace(suffix, '')) + + # put in the right order... + group = group.ix[:, columns] + + return group + + +def _assert_same_contents(join_chunk, source): + NA_SENTINEL = -1234567 # drop_duplicates not so NA-friendly... 
+ + jvalues = join_chunk.fillna(NA_SENTINEL).drop_duplicates().values + svalues = source.fillna(NA_SENTINEL).drop_duplicates().values + + rows = set(tuple(row) for row in jvalues) + assert(len(rows) == len(source)) + assert(all(tuple(row) in rows for row in svalues)) + + +def _assert_all_na(join_chunk, source_columns, join_col): + for c in source_columns: + if c in join_col: + continue + assert(join_chunk[c].isnull().all()) + + +def _join_by_hand(a, b, how='left'): + join_index = a.index.join(b.index, how=how) + + a_re = a.reindex(join_index) + b_re = b.reindex(join_index) + + result_columns = a.columns.append(b.columns) + + for col, s in compat.iteritems(b_re): + a_re[col] = s + return a_re.reindex(columns=result_columns) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py index 2505309768997..6e36100ddd0b4 100644 --- a/pandas/tools/tests/test_merge.py +++ b/pandas/tools/tests/test_merge.py @@ -9,23 +9,17 @@ import random import pandas as pd -from pandas.compat import range, lrange, lzip +from pandas.compat import lrange, lzip from pandas.tools.merge import merge, concat, MergeError from pandas.util.testing import (assert_frame_equal, assert_series_equal, slow) -from pandas import (DataFrame, Index, MultiIndex, - Series, date_range, Categorical, - compat) -import pandas.algos as algos +from pandas import DataFrame, Index, MultiIndex, Series, Categorical import pandas.util.testing as tm -a_ = np.array - N = 50 NGROUPS = 8 -JOIN_TYPES = ['inner', 'outer', 'left', 'right'] def get_test_data(ngroups=NGROUPS, n=N): @@ -58,496 +52,16 @@ def setUp(self): n=N // 5), 'value': np.random.randn(N // 5)}) - index, data = tm.getMixedTypeDict() - self.target = DataFrame(data, index=index) - - # Join on string value - self.source = DataFrame({'MergedA': data['A'], 'MergedD': data['D']}, - index=data['C']) - self.left = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'], 'v1': np.random.randn(7)}) self.right = DataFrame({'v2': np.random.randn(4)}, index=['d', 'b', 'c', 'a']) - def test_cython_left_outer_join(self): - left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) - right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64) - max_group = 5 - - ls, rs = algos.left_outer_join(left, right, max_group) - - exp_ls = left.argsort(kind='mergesort') - exp_rs = right.argsort(kind='mergesort') - - exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, - 6, 6, 7, 7, 8, 8, 9, 10]) - exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, - 4, 5, 4, 5, 4, 5, -1, -1]) - - exp_ls = exp_ls.take(exp_li) - exp_ls[exp_li == -1] = -1 - - exp_rs = exp_rs.take(exp_ri) - exp_rs[exp_ri == -1] = -1 - - self.assert_numpy_array_equal(ls, exp_ls) - self.assert_numpy_array_equal(rs, exp_rs) - - def test_cython_right_outer_join(self): - left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) - right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64) - max_group = 5 - - rs, ls = algos.left_outer_join(right, left, max_group) - - exp_ls = left.argsort(kind='mergesort') - exp_rs = right.argsort(kind='mergesort') - - # 0 1 1 1 - exp_li = a_([0, 1, 2, 3, 4, 5, 3, 4, 5, 3, 4, 5, - # 2 2 4 - 6, 7, 8, 6, 7, 8, -1]) - exp_ri = a_([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, - 4, 4, 4, 5, 5, 5, 6]) - - exp_ls = exp_ls.take(exp_li) - exp_ls[exp_li == -1] = -1 - - exp_rs = exp_rs.take(exp_ri) - exp_rs[exp_ri == -1] = -1 - - self.assert_numpy_array_equal(ls, exp_ls) - self.assert_numpy_array_equal(rs, exp_rs) - - def 
test_cython_inner_join(self): - left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) - right = a_([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64) - max_group = 5 - - ls, rs = algos.inner_join(left, right, max_group) - - exp_ls = left.argsort(kind='mergesort') - exp_rs = right.argsort(kind='mergesort') - - exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, - 6, 6, 7, 7, 8, 8]) - exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, - 4, 5, 4, 5, 4, 5]) - - exp_ls = exp_ls.take(exp_li) - exp_ls[exp_li == -1] = -1 - - exp_rs = exp_rs.take(exp_ri) - exp_rs[exp_ri == -1] = -1 - - self.assert_numpy_array_equal(ls, exp_ls) - self.assert_numpy_array_equal(rs, exp_rs) - - def test_left_outer_join(self): - joined_key2 = merge(self.df, self.df2, on='key2') - _check_join(self.df, self.df2, joined_key2, ['key2'], how='left') - - joined_both = merge(self.df, self.df2) - _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], - how='left') - - def test_right_outer_join(self): - joined_key2 = merge(self.df, self.df2, on='key2', how='right') - _check_join(self.df, self.df2, joined_key2, ['key2'], how='right') - - joined_both = merge(self.df, self.df2, how='right') - _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], - how='right') - - def test_full_outer_join(self): - joined_key2 = merge(self.df, self.df2, on='key2', how='outer') - _check_join(self.df, self.df2, joined_key2, ['key2'], how='outer') - - joined_both = merge(self.df, self.df2, how='outer') - _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], - how='outer') - - def test_inner_join(self): - joined_key2 = merge(self.df, self.df2, on='key2', how='inner') - _check_join(self.df, self.df2, joined_key2, ['key2'], how='inner') - - joined_both = merge(self.df, self.df2, how='inner') - _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], - how='inner') - - def test_handle_overlap(self): - joined = merge(self.df, self.df2, on='key2', - suffixes=['.foo', '.bar']) - - self.assertIn('key1.foo', joined) - self.assertIn('key1.bar', joined) - - def test_handle_overlap_arbitrary_key(self): - joined = merge(self.df, self.df2, - left_on='key2', right_on='key1', - suffixes=['.foo', '.bar']) - self.assertIn('key1.foo', joined) - self.assertIn('key2.bar', joined) - def test_merge_common(self): joined = merge(self.df, self.df2) exp = merge(self.df, self.df2, on=['key1', 'key2']) tm.assert_frame_equal(joined, exp) - def test_join_on(self): - target = self.target - source = self.source - - merged = target.join(source, on='C') - self.assert_series_equal(merged['MergedA'], target['A'], - check_names=False) - self.assert_series_equal(merged['MergedD'], target['D'], - check_names=False) - - # join with duplicates (fix regression from DataFrame/Matrix merge) - df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']}) - df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c']) - joined = df.join(df2, on='key') - expected = DataFrame({'key': ['a', 'a', 'b', 'b', 'c'], - 'value': [0, 0, 1, 1, 2]}) - assert_frame_equal(joined, expected) - - # Test when some are missing - df_a = DataFrame([[1], [2], [3]], index=['a', 'b', 'c'], - columns=['one']) - df_b = DataFrame([['foo'], ['bar']], index=[1, 2], - columns=['two']) - df_c = DataFrame([[1], [2]], index=[1, 2], - columns=['three']) - joined = df_a.join(df_b, on='one') - joined = joined.join(df_c, on='one') - self.assertTrue(np.isnan(joined['two']['c'])) - self.assertTrue(np.isnan(joined['three']['c'])) - - # merge column not p resent - self.assertRaises(KeyError, target.join, source, on='E') - - # 
overlap - source_copy = source.copy() - source_copy['A'] = 0 - self.assertRaises(ValueError, target.join, source_copy, on='A') - - def test_join_on_fails_with_different_right_index(self): - with tm.assertRaises(ValueError): - df = DataFrame({'a': np.random.choice(['m', 'f'], size=3), - 'b': np.random.randn(3)}) - df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10), - 'b': np.random.randn(10)}, - index=tm.makeCustomIndex(10, 2)) - merge(df, df2, left_on='a', right_index=True) - - def test_join_on_fails_with_different_left_index(self): - with tm.assertRaises(ValueError): - df = DataFrame({'a': np.random.choice(['m', 'f'], size=3), - 'b': np.random.randn(3)}, - index=tm.makeCustomIndex(10, 2)) - df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10), - 'b': np.random.randn(10)}) - merge(df, df2, right_on='b', left_index=True) - - def test_join_on_fails_with_different_column_counts(self): - with tm.assertRaises(ValueError): - df = DataFrame({'a': np.random.choice(['m', 'f'], size=3), - 'b': np.random.randn(3)}) - df2 = DataFrame({'a': np.random.choice(['m', 'f'], size=10), - 'b': np.random.randn(10)}, - index=tm.makeCustomIndex(10, 2)) - merge(df, df2, right_on='a', left_on=['a', 'b']) - - def test_join_on_fails_with_wrong_object_type(self): - # GH12081 - wrongly_typed = [Series([0, 1]), 2, 'str', None, np.array([0, 1])] - df = DataFrame({'a': [1, 1]}) - - for obj in wrongly_typed: - with tm.assertRaisesRegexp(ValueError, str(type(obj))): - merge(obj, df, left_on='a', right_on='a') - with tm.assertRaisesRegexp(ValueError, str(type(obj))): - merge(df, obj, left_on='a', right_on='a') - - def test_join_on_pass_vector(self): - expected = self.target.join(self.source, on='C') - del expected['C'] - - join_col = self.target.pop('C') - result = self.target.join(self.source, on=join_col) - assert_frame_equal(result, expected) - - def test_join_with_len0(self): - # nothing to merge - merged = self.target.join(self.source.reindex([]), on='C') - for col in self.source: - self.assertIn(col, merged) - self.assertTrue(merged[col].isnull().all()) - - merged2 = self.target.join(self.source.reindex([]), on='C', - how='inner') - self.assert_index_equal(merged2.columns, merged.columns) - self.assertEqual(len(merged2), 0) - - def test_join_on_inner(self): - df = DataFrame({'key': ['a', 'a', 'd', 'b', 'b', 'c']}) - df2 = DataFrame({'value': [0, 1]}, index=['a', 'b']) - - joined = df.join(df2, on='key', how='inner') - - expected = df.join(df2, on='key') - expected = expected[expected['value'].notnull()] - self.assert_series_equal(joined['key'], expected['key'], - check_dtype=False) - self.assert_series_equal(joined['value'], expected['value'], - check_dtype=False) - self.assert_index_equal(joined.index, expected.index) - - def test_join_on_singlekey_list(self): - df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']}) - df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c']) - - # corner cases - joined = df.join(df2, on=['key']) - expected = df.join(df2, on='key') - - assert_frame_equal(joined, expected) - - def test_join_on_series(self): - result = self.target.join(self.source['MergedA'], on='C') - expected = self.target.join(self.source[['MergedA']], on='C') - assert_frame_equal(result, expected) - - def test_join_on_series_buglet(self): - # GH #638 - df = DataFrame({'a': [1, 1]}) - ds = Series([2], index=[1], name='b') - result = df.join(ds, on='a') - expected = DataFrame({'a': [1, 1], - 'b': [2, 2]}, index=df.index) - tm.assert_frame_equal(result, expected) - - def test_join_index_mixed(self): - df1 
= DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True}, - index=np.arange(10), - columns=['A', 'B', 'C', 'D']) - self.assertEqual(df1['B'].dtype, np.int64) - self.assertEqual(df1['D'].dtype, np.bool_) - - df2 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True}, - index=np.arange(0, 10, 2), - columns=['A', 'B', 'C', 'D']) - - # overlap - joined = df1.join(df2, lsuffix='_one', rsuffix='_two') - expected_columns = ['A_one', 'B_one', 'C_one', 'D_one', - 'A_two', 'B_two', 'C_two', 'D_two'] - df1.columns = expected_columns[:4] - df2.columns = expected_columns[4:] - expected = _join_by_hand(df1, df2) - assert_frame_equal(joined, expected) - - # no overlapping blocks - df1 = DataFrame(index=np.arange(10)) - df1['bool'] = True - df1['string'] = 'foo' - - df2 = DataFrame(index=np.arange(5, 15)) - df2['int'] = 1 - df2['float'] = 1. - - for kind in JOIN_TYPES: - - joined = df1.join(df2, how=kind) - expected = _join_by_hand(df1, df2, how=kind) - assert_frame_equal(joined, expected) - - joined = df2.join(df1, how=kind) - expected = _join_by_hand(df2, df1, how=kind) - assert_frame_equal(joined, expected) - - def test_join_empty_bug(self): - # generated an exception in 0.4.3 - x = DataFrame() - x.join(DataFrame([3], index=[0], columns=['A']), how='outer') - - def test_join_unconsolidated(self): - # GH #331 - a = DataFrame(randn(30, 2), columns=['a', 'b']) - c = Series(randn(30)) - a['c'] = c - d = DataFrame(randn(30, 1), columns=['q']) - - # it works! - a.join(d) - d.join(a) - - def test_join_multiindex(self): - index1 = MultiIndex.from_arrays([['a', 'a', 'a', 'b', 'b', 'b'], - [1, 2, 3, 1, 2, 3]], - names=['first', 'second']) - - index2 = MultiIndex.from_arrays([['b', 'b', 'b', 'c', 'c', 'c'], - [1, 2, 3, 1, 2, 3]], - names=['first', 'second']) - - df1 = DataFrame(data=np.random.randn(6), index=index1, - columns=['var X']) - df2 = DataFrame(data=np.random.randn(6), index=index2, - columns=['var Y']) - - df1 = df1.sortlevel(0) - df2 = df2.sortlevel(0) - - joined = df1.join(df2, how='outer') - ex_index = index1._tuple_index.union(index2._tuple_index) - expected = df1.reindex(ex_index).join(df2.reindex(ex_index)) - expected.index.names = index1.names - assert_frame_equal(joined, expected) - self.assertEqual(joined.index.names, index1.names) - - df1 = df1.sortlevel(1) - df2 = df2.sortlevel(1) - - joined = df1.join(df2, how='outer').sortlevel(0) - ex_index = index1._tuple_index.union(index2._tuple_index) - expected = df1.reindex(ex_index).join(df2.reindex(ex_index)) - expected.index.names = index1.names - - assert_frame_equal(joined, expected) - self.assertEqual(joined.index.names, index1.names) - - def test_join_inner_multiindex(self): - key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux', - 'qux', 'snap'] - key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two', - 'three', 'one'] - - data = np.random.randn(len(key1)) - data = DataFrame({'key1': key1, 'key2': key2, - 'data': data}) - - index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], - ['one', 'two', 'three']], - labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['first', 'second']) - to_join = DataFrame(np.random.randn(10, 3), index=index, - columns=['j_one', 'j_two', 'j_three']) - - joined = data.join(to_join, on=['key1', 'key2'], how='inner') - expected = merge(data, to_join.reset_index(), - left_on=['key1', 'key2'], - right_on=['first', 'second'], how='inner', - sort=False) - - expected2 = merge(to_join, data, - right_on=['key1', 'key2'], left_index=True, - how='inner', sort=False) - 
assert_frame_equal(joined, expected2.reindex_like(joined)) - - expected2 = merge(to_join, data, right_on=['key1', 'key2'], - left_index=True, how='inner', sort=False) - - expected = expected.drop(['first', 'second'], axis=1) - expected.index = joined.index - - self.assertTrue(joined.index.is_monotonic) - assert_frame_equal(joined, expected) - - # _assert_same_contents(expected, expected2.ix[:, expected.columns]) - - def test_join_hierarchical_mixed(self): - # GH 2024 - df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=['a', 'b', 'c']) - new_df = df.groupby(['a']).agg({'b': [np.mean, np.sum]}) - other_df = DataFrame( - [(1, 2, 3), (7, 10, 6)], columns=['a', 'b', 'd']) - other_df.set_index('a', inplace=True) - # GH 9455, 12219 - with tm.assert_produces_warning(UserWarning): - result = merge(new_df, other_df, left_index=True, right_index=True) - self.assertTrue(('b', 'mean') in result) - self.assertTrue('b' in result) - - def test_join_float64_float32(self): - - a = DataFrame(randn(10, 2), columns=['a', 'b'], dtype=np.float64) - b = DataFrame(randn(10, 1), columns=['c'], dtype=np.float32) - joined = a.join(b) - self.assertEqual(joined.dtypes['a'], 'float64') - self.assertEqual(joined.dtypes['b'], 'float64') - self.assertEqual(joined.dtypes['c'], 'float32') - - a = np.random.randint(0, 5, 100).astype('int64') - b = np.random.random(100).astype('float64') - c = np.random.random(100).astype('float32') - df = DataFrame({'a': a, 'b': b, 'c': c}) - xpdf = DataFrame({'a': a, 'b': b, 'c': c}) - s = DataFrame(np.random.random(5).astype('float32'), columns=['md']) - rs = df.merge(s, left_on='a', right_index=True) - self.assertEqual(rs.dtypes['a'], 'int64') - self.assertEqual(rs.dtypes['b'], 'float64') - self.assertEqual(rs.dtypes['c'], 'float32') - self.assertEqual(rs.dtypes['md'], 'float32') - - xp = xpdf.merge(s, left_on='a', right_index=True) - assert_frame_equal(rs, xp) - - def test_join_many_non_unique_index(self): - df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]}) - df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]}) - df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]}) - idf1 = df1.set_index(["a", "b"]) - idf2 = df2.set_index(["a", "b"]) - idf3 = df3.set_index(["a", "b"]) - - result = idf1.join([idf2, idf3], how='outer') - - df_partially_merged = merge(df1, df2, on=['a', 'b'], how='outer') - expected = merge(df_partially_merged, df3, on=['a', 'b'], how='outer') - - result = result.reset_index() - expected = expected[result.columns] - expected['a'] = expected.a.astype('int64') - expected['b'] = expected.b.astype('int64') - assert_frame_equal(result, expected) - - df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]}) - df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]}) - df3 = DataFrame( - {"a": [1, 1, 1], "b": [1, 1, 2], "e": [1000, 2000, 3000]}) - idf1 = df1.set_index(["a", "b"]) - idf2 = df2.set_index(["a", "b"]) - idf3 = df3.set_index(["a", "b"]) - result = idf1.join([idf2, idf3], how='inner') - - df_partially_merged = merge(df1, df2, on=['a', 'b'], how='inner') - expected = merge(df_partially_merged, df3, on=['a', 'b'], how='inner') - - result = result.reset_index() - - assert_frame_equal(result, expected.ix[:, result.columns]) - - # GH 11519 - df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', - 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.random.randn(8)}) - s = Series(np.repeat(np.arange(8), 2), - index=np.repeat(np.arange(8), 2), 
name='TEST') - inner = df.join(s, how='inner') - outer = df.join(s, how='outer') - left = df.join(s, how='left') - right = df.join(s, how='right') - assert_frame_equal(inner, outer) - assert_frame_equal(inner, left) - assert_frame_equal(inner, right) - def test_merge_index_singlekey_right_vs_left(self): left = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'], 'v1': np.random.randn(7)}) @@ -651,23 +165,6 @@ def test_merge_nocopy(self): merged['d'] = 'peekaboo' self.assertTrue((right['d'] == 'peekaboo').all()) - def test_join_sort(self): - left = DataFrame({'key': ['foo', 'bar', 'baz', 'foo'], - 'value': [1, 2, 3, 4]}) - right = DataFrame({'value2': ['a', 'b', 'c']}, - index=['bar', 'baz', 'foo']) - - joined = left.join(right, on='key', sort=True) - expected = DataFrame({'key': ['bar', 'baz', 'foo', 'foo'], - 'value': [2, 3, 1, 4], - 'value2': ['a', 'b', 'c', 'c']}, - index=[1, 2, 0, 3]) - assert_frame_equal(joined, expected) - - # smoke test - joined = left.join(right, on='key', sort=False) - self.assert_index_equal(joined.index, pd.Index(lrange(4))) - def test_intelligently_handle_join_key(self): # #733, be a bit more 1337 about not returning unconsolidated DataFrame @@ -737,20 +234,6 @@ def test_handle_join_key_pass_array(self): merged = merge(left, right, left_index=True, right_on=key, how='outer') self.assert_series_equal(merged['key_0'], Series(key, name='key_0')) - def test_mixed_type_join_with_suffix(self): - # GH #916 - df = DataFrame(np.random.randn(20, 6), - columns=['a', 'b', 'c', 'd', 'e', 'f']) - df.insert(0, 'id', 0) - df.insert(5, 'dt', 'foo') - - grouped = df.groupby('id') - mn = grouped.mean() - cn = grouped.count() - - # it works! - mn.join(cn, rsuffix='_right') - def test_no_overlap_more_informative_error(self): dt = datetime.now() df1 = DataFrame({'x': ['a']}, index=[dt]) @@ -963,68 +446,6 @@ def _constructor(self): tm.assertIsInstance(result, NotADataFrame) - def test_empty_dtype_coerce(self): - - # xref to #12411 - # xref to #12045 - # xref to #11594 - # see below - - # 10571 - df1 = DataFrame(data=[[1, None], [2, None]], columns=['a', 'b']) - df2 = DataFrame(data=[[3, None], [4, None]], columns=['a', 'b']) - result = concat([df1, df2]) - expected = df1.dtypes - assert_series_equal(result.dtypes, expected) - - def test_dtype_coerceion(self): - - # 12411 - df = DataFrame({'date': [pd.Timestamp('20130101').tz_localize('UTC'), - pd.NaT]}) - - result = concat([df.iloc[[0]], df.iloc[[1]]]) - assert_series_equal(result.dtypes, df.dtypes) - - # 12045 - import datetime - df = DataFrame({'date': [datetime.datetime(2012, 1, 1), - datetime.datetime(1012, 1, 2)]}) - result = concat([df.iloc[[0]], df.iloc[[1]]]) - assert_series_equal(result.dtypes, df.dtypes) - - # 11594 - df = DataFrame({'text': ['some words'] + [None] * 9}) - result = concat([df.iloc[[0]], df.iloc[[1]]]) - assert_series_equal(result.dtypes, df.dtypes) - - def test_append_dtype_coerce(self): - - # GH 4993 - # appending with datetime will incorrectly convert datetime64 - import datetime as dt - from pandas import NaT - - df1 = DataFrame(index=[1, 2], data=[dt.datetime(2013, 1, 1, 0, 0), - dt.datetime(2013, 1, 2, 0, 0)], - columns=['start_time']) - df2 = DataFrame(index=[4, 5], data=[[dt.datetime(2013, 1, 3, 0, 0), - dt.datetime(2013, 1, 3, 6, 10)], - [dt.datetime(2013, 1, 4, 0, 0), - dt.datetime(2013, 1, 4, 7, 10)]], - columns=['start_time', 'end_time']) - - expected = concat([Series([NaT, NaT, dt.datetime(2013, 1, 3, 6, 10), - dt.datetime(2013, 1, 4, 7, 10)], - name='end_time'), - Series([dt.datetime(2013, 1, 
1, 0, 0), - dt.datetime(2013, 1, 2, 0, 0), - dt.datetime(2013, 1, 3, 0, 0), - dt.datetime(2013, 1, 4, 0, 0)], - name='start_time')], axis=1) - result = df1.append(df2, ignore_index=True) - assert_frame_equal(result, expected) - def test_join_append_timedeltas(self): import datetime as dt @@ -1052,6 +473,47 @@ def test_join_append_timedeltas(self): '0r': Series([td, NaT], index=list('AB'))}) assert_frame_equal(result, expected) + def test_other_datetime_unit(self): + # GH 13389 + df1 = pd.DataFrame({'entity_id': [101, 102]}) + s = pd.Series([None, None], index=[101, 102], name='days') + + for dtype in ['datetime64[D]', 'datetime64[h]', 'datetime64[m]', + 'datetime64[s]', 'datetime64[ms]', 'datetime64[us]', + 'datetime64[ns]']: + + df2 = s.astype(dtype).to_frame('days') + # coerces to datetime64[ns], thus should not be affected + self.assertEqual(df2['days'].dtype, 'datetime64[ns]') + + result = df1.merge(df2, left_on='entity_id', right_index=True) + + exp = pd.DataFrame({'entity_id': [101, 102], + 'days': np.array(['nat', 'nat'], + dtype='datetime64[ns]')}, + columns=['entity_id', 'days']) + tm.assert_frame_equal(result, exp) + + def test_other_timedelta_unit(self): + # GH 13389 + df1 = pd.DataFrame({'entity_id': [101, 102]}) + s = pd.Series([None, None], index=[101, 102], name='days') + + for dtype in ['timedelta64[D]', 'timedelta64[h]', 'timedelta64[m]', + 'timedelta64[s]', 'timedelta64[ms]', 'timedelta64[us]', + 'timedelta64[ns]']: + + df2 = s.astype(dtype).to_frame('days') + self.assertEqual(df2['days'].dtype, dtype) + + result = df1.merge(df2, left_on='entity_id', right_index=True) + + exp = pd.DataFrame({'entity_id': [101, 102], + 'days': np.array(['nat', 'nat'], + dtype=dtype)}, + columns=['entity_id', 'days']) + tm.assert_frame_equal(result, exp) + def test_overlapping_columns_error_message(self): df = DataFrame({'key': [1, 2, 3], 'v1': [4, 5, 6], @@ -1140,227 +602,6 @@ def test_merge_on_periods(self): self.assertEqual(result['value_x'].dtype, 'object') self.assertEqual(result['value_y'].dtype, 'object') - def test_concat_NaT_series(self): - # GH 11693 - # test for merging NaT series with datetime series.
- x = Series(date_range('20151124 08:00', '20151124 09:00', - freq='1h', tz='US/Eastern')) - y = Series(pd.NaT, index=[0, 1], dtype='datetime64[ns, US/Eastern]') - expected = Series([x[0], x[1], pd.NaT, pd.NaT]) - - result = concat([x, y], ignore_index=True) - tm.assert_series_equal(result, expected) - - # all NaT with tz - expected = Series(pd.NaT, index=range(4), - dtype='datetime64[ns, US/Eastern]') - result = pd.concat([y, y], ignore_index=True) - tm.assert_series_equal(result, expected) - - # without tz - x = pd.Series(pd.date_range('20151124 08:00', - '20151124 09:00', freq='1h')) - y = pd.Series(pd.date_range('20151124 10:00', - '20151124 11:00', freq='1h')) - y[:] = pd.NaT - expected = pd.Series([x[0], x[1], pd.NaT, pd.NaT]) - result = pd.concat([x, y], ignore_index=True) - tm.assert_series_equal(result, expected) - - # all NaT without tz - x[:] = pd.NaT - expected = pd.Series(pd.NaT, index=range(4), - dtype='datetime64[ns]') - result = pd.concat([x, y], ignore_index=True) - tm.assert_series_equal(result, expected) - - def test_concat_tz_frame(self): - df2 = DataFrame(dict(A=pd.Timestamp('20130102', tz='US/Eastern'), - B=pd.Timestamp('20130603', tz='CET')), - index=range(5)) - - # concat - df3 = pd.concat([df2.A.to_frame(), df2.B.to_frame()], axis=1) - assert_frame_equal(df2, df3) - - def test_concat_tz_series(self): - # GH 11755 - # tz and no tz - x = Series(date_range('20151124 08:00', - '20151124 09:00', - freq='1h', tz='UTC')) - y = Series(date_range('2012-01-01', '2012-01-02')) - expected = Series([x[0], x[1], y[0], y[1]], - dtype='object') - result = concat([x, y], ignore_index=True) - tm.assert_series_equal(result, expected) - - # GH 11887 - # concat tz and object - x = Series(date_range('20151124 08:00', - '20151124 09:00', - freq='1h', tz='UTC')) - y = Series(['a', 'b']) - expected = Series([x[0], x[1], y[0], y[1]], - dtype='object') - result = concat([x, y], ignore_index=True) - tm.assert_series_equal(result, expected) - - # 12217 - # 12306 fixed I think - - # Concat'ing two UTC times - first = pd.DataFrame([[datetime(2016, 1, 1)]]) - first[0] = first[0].dt.tz_localize('UTC') - - second = pd.DataFrame([[datetime(2016, 1, 2)]]) - second[0] = second[0].dt.tz_localize('UTC') - - result = pd.concat([first, second]) - self.assertEqual(result[0].dtype, 'datetime64[ns, UTC]') - - # Concat'ing two London times - first = pd.DataFrame([[datetime(2016, 1, 1)]]) - first[0] = first[0].dt.tz_localize('Europe/London') - - second = pd.DataFrame([[datetime(2016, 1, 2)]]) - second[0] = second[0].dt.tz_localize('Europe/London') - - result = pd.concat([first, second]) - self.assertEqual(result[0].dtype, 'datetime64[ns, Europe/London]') - - # Concat'ing 2+1 London times - first = pd.DataFrame([[datetime(2016, 1, 1)], [datetime(2016, 1, 2)]]) - first[0] = first[0].dt.tz_localize('Europe/London') - - second = pd.DataFrame([[datetime(2016, 1, 3)]]) - second[0] = second[0].dt.tz_localize('Europe/London') - - result = pd.concat([first, second]) - self.assertEqual(result[0].dtype, 'datetime64[ns, Europe/London]') - - # Concat'ing 1+2 London times - first = pd.DataFrame([[datetime(2016, 1, 1)]]) - first[0] = first[0].dt.tz_localize('Europe/London') - - second = pd.DataFrame([[datetime(2016, 1, 2)], [datetime(2016, 1, 3)]]) - second[0] = second[0].dt.tz_localize('Europe/London') - - result = pd.concat([first, second]) - self.assertEqual(result[0].dtype, 'datetime64[ns, Europe/London]') - - def test_concat_tz_series_with_datetimelike(self): - # GH 12620 - # tz and timedelta - x = 
[pd.Timestamp('2011-01-01', tz='US/Eastern'), - pd.Timestamp('2011-02-01', tz='US/Eastern')] - y = [pd.Timedelta('1 day'), pd.Timedelta('2 day')] - result = concat([pd.Series(x), pd.Series(y)], ignore_index=True) - tm.assert_series_equal(result, pd.Series(x + y, dtype='object')) - - # tz and period - y = [pd.Period('2011-03', freq='M'), pd.Period('2011-04', freq='M')] - result = concat([pd.Series(x), pd.Series(y)], ignore_index=True) - tm.assert_series_equal(result, pd.Series(x + y, dtype='object')) - - def test_concat_period_series(self): - x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D')) - y = Series(pd.PeriodIndex(['2015-10-01', '2016-01-01'], freq='D')) - expected = Series([x[0], x[1], y[0], y[1]], dtype='object') - result = concat([x, y], ignore_index=True) - tm.assert_series_equal(result, expected) - self.assertEqual(result.dtype, 'object') - - # different freq - x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D')) - y = Series(pd.PeriodIndex(['2015-10-01', '2016-01-01'], freq='M')) - expected = Series([x[0], x[1], y[0], y[1]], dtype='object') - result = concat([x, y], ignore_index=True) - tm.assert_series_equal(result, expected) - self.assertEqual(result.dtype, 'object') - - x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D')) - y = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='M')) - expected = Series([x[0], x[1], y[0], y[1]], dtype='object') - result = concat([x, y], ignore_index=True) - tm.assert_series_equal(result, expected) - self.assertEqual(result.dtype, 'object') - - # non-period - x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D')) - y = Series(pd.DatetimeIndex(['2015-11-01', '2015-12-01'])) - expected = Series([x[0], x[1], y[0], y[1]], dtype='object') - result = concat([x, y], ignore_index=True) - tm.assert_series_equal(result, expected) - self.assertEqual(result.dtype, 'object') - - x = Series(pd.PeriodIndex(['2015-11-01', '2015-12-01'], freq='D')) - y = Series(['A', 'B']) - expected = Series([x[0], x[1], y[0], y[1]], dtype='object') - result = concat([x, y], ignore_index=True) - tm.assert_series_equal(result, expected) - self.assertEqual(result.dtype, 'object') - - def test_concat_empty_series(self): - # GH 11082 - s1 = pd.Series([1, 2, 3], name='x') - s2 = pd.Series(name='y') - res = pd.concat([s1, s2], axis=1) - exp = pd.DataFrame({'x': [1, 2, 3], 'y': [np.nan, np.nan, np.nan]}) - tm.assert_frame_equal(res, exp) - - s1 = pd.Series([1, 2, 3], name='x') - s2 = pd.Series(name='y') - res = pd.concat([s1, s2], axis=0) - # name will be reset - exp = pd.Series([1, 2, 3]) - tm.assert_series_equal(res, exp) - - # empty Series with no name - s1 = pd.Series([1, 2, 3], name='x') - s2 = pd.Series(name=None) - res = pd.concat([s1, s2], axis=1) - exp = pd.DataFrame({'x': [1, 2, 3], 0: [np.nan, np.nan, np.nan]}, - columns=['x', 0]) - tm.assert_frame_equal(res, exp) - - def test_default_index(self): - # is_series and ignore_index - s1 = pd.Series([1, 2, 3], name='x') - s2 = pd.Series([4, 5, 6], name='y') - res = pd.concat([s1, s2], axis=1, ignore_index=True) - self.assertIsInstance(res.columns, pd.RangeIndex) - exp = pd.DataFrame([[1, 4], [2, 5], [3, 6]]) - # use check_index_type=True to check the result have - # RangeIndex (default index) - tm.assert_frame_equal(res, exp, check_index_type=True, - check_column_type=True) - - # is_series and all inputs have no names - s1 = pd.Series([1, 2, 3]) - s2 = pd.Series([4, 5, 6]) - res = pd.concat([s1, s2], axis=1, ignore_index=False) - self.assertIsInstance(res.columns, 
pd.RangeIndex) - exp = pd.DataFrame([[1, 4], [2, 5], [3, 6]]) - exp.columns = pd.RangeIndex(2) - tm.assert_frame_equal(res, exp, check_index_type=True, - check_column_type=True) - - # is_dataframe and ignore_index - df1 = pd.DataFrame({'A': [1, 2], 'B': [5, 6]}) - df2 = pd.DataFrame({'A': [3, 4], 'B': [7, 8]}) - - res = pd.concat([df1, df2], axis=0, ignore_index=True) - exp = pd.DataFrame([[1, 5], [2, 6], [3, 7], [4, 8]], - columns=['A', 'B']) - tm.assert_frame_equal(res, exp, check_index_type=True, - check_column_type=True) - - res = pd.concat([df1, df2], axis=1, ignore_index=True) - exp = pd.DataFrame([[1, 5, 3, 7], [2, 6, 4, 8]]) - tm.assert_frame_equal(res, exp, check_index_type=True, - check_column_type=True) - def test_indicator(self): # PR #10054. xref #7412 and closes #8790. df1 = DataFrame({'col1': [0, 1], 'col_left': [ @@ -2122,90 +1363,6 @@ def f(): self.assertRaises(NotImplementedError, f) -def _check_join(left, right, result, join_col, how='left', - lsuffix='_x', rsuffix='_y'): - - # some smoke tests - for c in join_col: - assert(result[c].notnull().all()) - - left_grouped = left.groupby(join_col) - right_grouped = right.groupby(join_col) - - for group_key, group in result.groupby(join_col): - l_joined = _restrict_to_columns(group, left.columns, lsuffix) - r_joined = _restrict_to_columns(group, right.columns, rsuffix) - - try: - lgroup = left_grouped.get_group(group_key) - except KeyError: - if how in ('left', 'inner'): - raise AssertionError('key %s should not have been in the join' - % str(group_key)) - - _assert_all_na(l_joined, left.columns, join_col) - else: - _assert_same_contents(l_joined, lgroup) - - try: - rgroup = right_grouped.get_group(group_key) - except KeyError: - if how in ('right', 'inner'): - raise AssertionError('key %s should not have been in the join' - % str(group_key)) - - _assert_all_na(r_joined, right.columns, join_col) - else: - _assert_same_contents(r_joined, rgroup) - - -def _restrict_to_columns(group, columns, suffix): - found = [c for c in group.columns - if c in columns or c.replace(suffix, '') in columns] - - # filter - group = group.ix[:, found] - - # get rid of suffixes, if any - group = group.rename(columns=lambda x: x.replace(suffix, '')) - - # put in the right order... - group = group.ix[:, columns] - - return group - - -def _assert_same_contents(join_chunk, source): - NA_SENTINEL = -1234567 # drop_duplicates not so NA-friendly... 
- - jvalues = join_chunk.fillna(NA_SENTINEL).drop_duplicates().values - svalues = source.fillna(NA_SENTINEL).drop_duplicates().values - - rows = set(tuple(row) for row in jvalues) - assert(len(rows) == len(source)) - assert(all(tuple(row) in rows for row in svalues)) - - -def _assert_all_na(join_chunk, source_columns, join_col): - for c in source_columns: - if c in join_col: - continue - assert(join_chunk[c].isnull().all()) - - -def _join_by_hand(a, b, how='left'): - join_index = a.index.join(b.index, how=how) - - a_re = a.reindex(join_index) - b_re = b.reindex(join_index) - - result_columns = a.columns.append(b.columns) - - for col, s in compat.iteritems(b_re): - a_re[col] = s - return a_re.reindex(columns=result_columns) - - if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/tools/tests/test_merge_asof.py b/pandas/tools/tests/test_merge_asof.py new file mode 100644 index 0000000000000..f413618624592 --- /dev/null +++ b/pandas/tools/tests/test_merge_asof.py @@ -0,0 +1,464 @@ +import nose +import os + +import numpy as np +import pandas as pd +from pandas import (merge_asof, read_csv, + to_datetime, Timedelta) +from pandas.tools.merge import MergeError +from pandas.util import testing as tm +from pandas.util.testing import assert_frame_equal + + +class TestAsOfMerge(tm.TestCase): + _multiprocess_can_split_ = True + + def read_data(self, name, dedupe=False): + path = os.path.join(tm.get_data_path(), name) + x = read_csv(path) + if dedupe: + x = (x.drop_duplicates(['time', 'ticker'], keep='last') + .reset_index(drop=True) + ) + x.time = to_datetime(x.time) + return x + + def setUp(self): + + self.trades = self.read_data('trades.csv') + self.quotes = self.read_data('quotes.csv', dedupe=True) + self.asof = self.read_data('asof.csv') + self.tolerance = self.read_data('tolerance.csv') + self.allow_exact_matches = self.read_data('allow_exact_matches.csv') + self.allow_exact_matches_and_tolerance = self.read_data( + 'allow_exact_matches_and_tolerance.csv') + + def test_examples1(self): + """ doc-string examples """ + + left = pd.DataFrame({'a': [1, 5, 10], + 'left_val': ['a', 'b', 'c']}) + right = pd.DataFrame({'a': [1, 2, 3, 6, 7], + 'right_val': [1, 2, 3, 6, 7]}) + + pd.merge_asof(left, right, on='a') + + def test_examples2(self): + """ doc-string examples """ + + trades = pd.DataFrame({ + 'time': pd.to_datetime(['20160525 13:30:00.023', + '20160525 13:30:00.038', + '20160525 13:30:00.048', + '20160525 13:30:00.048', + '20160525 13:30:00.048']), + 'ticker': ['MSFT', 'MSFT', + 'GOOG', 'GOOG', 'AAPL'], + 'price': [51.95, 51.95, + 720.77, 720.92, 98.00], + 'quantity': [75, 155, + 100, 100, 100]}, + columns=['time', 'ticker', 'price', 'quantity']) + + quotes = pd.DataFrame({ + 'time': pd.to_datetime(['20160525 13:30:00.023', + '20160525 13:30:00.023', + '20160525 13:30:00.030', + '20160525 13:30:00.041', + '20160525 13:30:00.048', + '20160525 13:30:00.049', + '20160525 13:30:00.072', + '20160525 13:30:00.075']), + 'ticker': ['GOOG', 'MSFT', 'MSFT', + 'MSFT', 'GOOG', 'AAPL', 'GOOG', + 'MSFT'], + 'bid': [720.50, 51.95, 51.97, 51.99, + 720.50, 97.99, 720.50, 52.01], + 'ask': [720.93, 51.96, 51.98, 52.00, + 720.93, 98.01, 720.88, 52.03]}, + columns=['time', 'ticker', 'bid', 'ask']) + + pd.merge_asof(trades, quotes, + on='time', + by='ticker') + + pd.merge_asof(trades, quotes, + on='time', + by='ticker', + tolerance=pd.Timedelta('2ms')) + + pd.merge_asof(trades, quotes, + on='time', + by='ticker', + 
tolerance=pd.Timedelta('10ms'), + allow_exact_matches=False) + + def test_basic(self): + + expected = self.asof + trades = self.trades + quotes = self.quotes + + result = merge_asof(trades, quotes, + on='time', + by='ticker') + assert_frame_equal(result, expected) + + def test_basic_categorical(self): + + expected = self.asof + trades = self.trades.copy() + trades.ticker = trades.ticker.astype('category') + quotes = self.quotes.copy() + quotes.ticker = quotes.ticker.astype('category') + + result = merge_asof(trades, quotes, + on='time', + by='ticker') + assert_frame_equal(result, expected) + + def test_missing_right_by(self): + + expected = self.asof + trades = self.trades + quotes = self.quotes + + q = quotes[quotes.ticker != 'MSFT'] + result = merge_asof(trades, q, + on='time', + by='ticker') + expected.loc[expected.ticker == 'MSFT', ['bid', 'ask']] = np.nan + assert_frame_equal(result, expected) + + def test_basic2(self): + + expected = self.read_data('asof2.csv') + trades = self.read_data('trades2.csv') + quotes = self.read_data('quotes2.csv', dedupe=True) + + result = merge_asof(trades, quotes, + on='time', + by='ticker') + assert_frame_equal(result, expected) + + def test_basic_no_by(self): + f = lambda x: x[x.ticker == 'MSFT'].drop('ticker', axis=1) \ + .reset_index(drop=True) + + # just use a single ticker + expected = f(self.asof) + trades = f(self.trades) + quotes = f(self.quotes) + + result = merge_asof(trades, quotes, + on='time') + assert_frame_equal(result, expected) + + def test_valid_join_keys(self): + + trades = self.trades + quotes = self.quotes + + with self.assertRaises(MergeError): + merge_asof(trades, quotes, + left_on='time', + right_on='bid', + by='ticker') + + with self.assertRaises(MergeError): + merge_asof(trades, quotes, + on=['time', 'ticker'], + by='ticker') + + with self.assertRaises(MergeError): + merge_asof(trades, quotes, + by='ticker') + + def test_with_duplicates(self): + + q = pd.concat([self.quotes, self.quotes]).sort_values( + ['time', 'ticker']).reset_index(drop=True) + result = merge_asof(self.trades, q, + on='time', + by='ticker') + expected = self.read_data('asof.csv') + assert_frame_equal(result, expected) + + def test_with_duplicates_no_on(self): + + df1 = pd.DataFrame({'key': [1, 1, 3], + 'left_val': [1, 2, 3]}) + df2 = pd.DataFrame({'key': [1, 2, 2], + 'right_val': [1, 2, 3]}) + result = merge_asof(df1, df2, on='key') + expected = pd.DataFrame({'key': [1, 1, 3], + 'left_val': [1, 2, 3], + 'right_val': [1, 1, 3]}) + assert_frame_equal(result, expected) + + def test_valid_allow_exact_matches(self): + + trades = self.trades + quotes = self.quotes + + with self.assertRaises(MergeError): + merge_asof(trades, quotes, + on='time', + by='ticker', + allow_exact_matches='foo') + + def test_valid_tolerance(self): + + trades = self.trades + quotes = self.quotes + + # dti + merge_asof(trades, quotes, + on='time', + by='ticker', + tolerance=Timedelta('1s')) + + # integer + merge_asof(trades.reset_index(), quotes.reset_index(), + on='index', + by='ticker', + tolerance=1) + + # incompat + with self.assertRaises(MergeError): + merge_asof(trades, quotes, + on='time', + by='ticker', + tolerance=1) + + # invalid + with self.assertRaises(MergeError): + merge_asof(trades.reset_index(), quotes.reset_index(), + on='index', + by='ticker', + tolerance=1.0) + + # invalid negative + with self.assertRaises(MergeError): + merge_asof(trades, quotes, + on='time', + by='ticker', + tolerance=-Timedelta('1s')) + + with self.assertRaises(MergeError): + 
merge_asof(trades.reset_index(), quotes.reset_index(), + on='index', + by='ticker', + tolerance=-1) + + def test_non_sorted(self): + + trades = self.trades.sort_values('time', ascending=False) + quotes = self.quotes.sort_values('time', ascending=False) + + # we require that we are already sorted on time & quotes + self.assertFalse(trades.time.is_monotonic) + self.assertFalse(quotes.time.is_monotonic) + with self.assertRaises(ValueError): + merge_asof(trades, quotes, + on='time', + by='ticker') + + trades = self.trades.sort_values('time') + self.assertTrue(trades.time.is_monotonic) + self.assertFalse(quotes.time.is_monotonic) + with self.assertRaises(ValueError): + merge_asof(trades, quotes, + on='time', + by='ticker') + + quotes = self.quotes.sort_values('time') + self.assertTrue(trades.time.is_monotonic) + self.assertTrue(quotes.time.is_monotonic) + + # ok, though has dupes + merge_asof(trades, self.quotes, + on='time', + by='ticker') + + def test_tolerance(self): + + trades = self.trades + quotes = self.quotes + + result = merge_asof(trades, quotes, + on='time', + by='ticker', + tolerance=Timedelta('1day')) + expected = self.tolerance + assert_frame_equal(result, expected) + + def test_allow_exact_matches(self): + + result = merge_asof(self.trades, self.quotes, + on='time', + by='ticker', + allow_exact_matches=False) + expected = self.allow_exact_matches + assert_frame_equal(result, expected) + + def test_allow_exact_matches_and_tolerance(self): + + result = merge_asof(self.trades, self.quotes, + on='time', + by='ticker', + tolerance=Timedelta('100ms'), + allow_exact_matches=False) + expected = self.allow_exact_matches_and_tolerance + assert_frame_equal(result, expected) + + def test_allow_exact_matches_and_tolerance2(self): + # GH 13695 + df1 = pd.DataFrame({ + 'time': pd.to_datetime(['2016-07-15 13:30:00.030']), + 'username': ['bob']}) + df2 = pd.DataFrame({ + 'time': pd.to_datetime(['2016-07-15 13:30:00.000', + '2016-07-15 13:30:00.030']), + 'version': [1, 2]}) + + result = pd.merge_asof(df1, df2, on='time') + expected = pd.DataFrame({ + 'time': pd.to_datetime(['2016-07-15 13:30:00.030']), + 'username': ['bob'], + 'version': [2]}) + assert_frame_equal(result, expected) + + result = pd.merge_asof(df1, df2, on='time', allow_exact_matches=False) + expected = pd.DataFrame({ + 'time': pd.to_datetime(['2016-07-15 13:30:00.030']), + 'username': ['bob'], + 'version': [1]}) + assert_frame_equal(result, expected) + + result = pd.merge_asof(df1, df2, on='time', allow_exact_matches=False, + tolerance=pd.Timedelta('10ms')) + expected = pd.DataFrame({ + 'time': pd.to_datetime(['2016-07-15 13:30:00.030']), + 'username': ['bob'], + 'version': [np.nan]}) + assert_frame_equal(result, expected) + + def test_allow_exact_matches_and_tolerance3(self): + # GH 13709 + df1 = pd.DataFrame({ + 'time': pd.to_datetime(['2016-07-15 13:30:00.030', + '2016-07-15 13:30:00.030']), + 'username': ['bob', 'charlie']}) + df2 = pd.DataFrame({ + 'time': pd.to_datetime(['2016-07-15 13:30:00.000', + '2016-07-15 13:30:00.030']), + 'version': [1, 2]}) + + result = pd.merge_asof(df1, df2, on='time', allow_exact_matches=False, + tolerance=pd.Timedelta('10ms')) + expected = pd.DataFrame({ + 'time': pd.to_datetime(['2016-07-15 13:30:00.030', + '2016-07-15 13:30:00.030']), + 'username': ['bob', 'charlie'], + 'version': [np.nan, np.nan]}) + assert_frame_equal(result, expected) + + def test_by_int(self): + # we specialize by type, so test that this is correct + df1 = pd.DataFrame({ + 'time': pd.to_datetime(['20160525 13:30:00.020', + 
'20160525 13:30:00.030', + '20160525 13:30:00.040', + '20160525 13:30:00.050', + '20160525 13:30:00.060']), + 'key': [1, 2, 1, 3, 2], + 'value1': [1.1, 1.2, 1.3, 1.4, 1.5]}, + columns=['time', 'key', 'value1']) + + df2 = pd.DataFrame({ + 'time': pd.to_datetime(['20160525 13:30:00.015', + '20160525 13:30:00.020', + '20160525 13:30:00.025', + '20160525 13:30:00.035', + '20160525 13:30:00.040', + '20160525 13:30:00.055', + '20160525 13:30:00.060', + '20160525 13:30:00.065']), + 'key': [2, 1, 1, 3, 2, 1, 2, 3], + 'value2': [2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8]}, + columns=['time', 'key', 'value2']) + + result = pd.merge_asof(df1, df2, on='time', by='key') + + expected = pd.DataFrame({ + 'time': pd.to_datetime(['20160525 13:30:00.020', + '20160525 13:30:00.030', + '20160525 13:30:00.040', + '20160525 13:30:00.050', + '20160525 13:30:00.060']), + 'key': [1, 2, 1, 3, 2], + 'value1': [1.1, 1.2, 1.3, 1.4, 1.5], + 'value2': [2.2, 2.1, 2.3, 2.4, 2.7]}, + columns=['time', 'key', 'value1', 'value2']) + + assert_frame_equal(result, expected) + + def test_on_float(self): + # mimics how to determine the minimum-price variation + df1 = pd.DataFrame({ + 'price': [5.01, 0.0023, 25.13, 340.05, 30.78, 1040.90, 0.0078], + 'symbol': list("ABCDEFG")}, + columns=['symbol', 'price']) + + df2 = pd.DataFrame({ + 'price': [0.0, 1.0, 100.0], + 'mpv': [0.0001, 0.01, 0.05]}, + columns=['price', 'mpv']) + + df1 = df1.sort_values('price').reset_index(drop=True) + + result = pd.merge_asof(df1, df2, on='price') + + expected = pd.DataFrame({ + 'symbol': list("BGACEDF"), + 'price': [0.0023, 0.0078, 5.01, 25.13, 30.78, 340.05, 1040.90], + 'mpv': [0.0001, 0.0001, 0.01, 0.01, 0.01, 0.05, 0.05]}, + columns=['symbol', 'price', 'mpv']) + + assert_frame_equal(result, expected) + + def test_on_float_by_int(self): + # type specialize both "by" and "on" parameters + df1 = pd.DataFrame({ + 'symbol': list("AAABBBCCC"), + 'exch': [1, 2, 3, 1, 2, 3, 1, 2, 3], + 'price': [3.26, 3.2599, 3.2598, 12.58, 12.59, + 12.5, 378.15, 378.2, 378.25]}, + columns=['symbol', 'exch', 'price']) + + df2 = pd.DataFrame({ + 'exch': [1, 1, 1, 2, 2, 2, 3, 3, 3], + 'price': [0.0, 1.0, 100.0, 0.0, 5.0, 100.0, 0.0, 5.0, 1000.0], + 'mpv': [0.0001, 0.01, 0.05, 0.0001, 0.01, 0.1, 0.0001, 0.25, 1.0]}, + columns=['exch', 'price', 'mpv']) + + df1 = df1.sort_values('price').reset_index(drop=True) + df2 = df2.sort_values('price').reset_index(drop=True) + + result = pd.merge_asof(df1, df2, on='price', by='exch') + + expected = pd.DataFrame({ + 'symbol': list("AAABBBCCC"), + 'exch': [3, 2, 1, 3, 1, 2, 1, 2, 3], + 'price': [3.2598, 3.2599, 3.26, 12.5, 12.58, + 12.59, 378.15, 378.2, 378.25], + 'mpv': [0.0001, 0.0001, 0.01, 0.25, 0.01, 0.01, 0.05, 0.1, 0.25]}, + columns=['symbol', 'exch', 'price', 'mpv']) + + assert_frame_equal(result, expected) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tools/tests/test_ordered_merge.py b/pandas/tools/tests/test_merge_ordered.py similarity index 85% rename from pandas/tools/tests/test_ordered_merge.py rename to pandas/tools/tests/test_merge_ordered.py index 53f00d9761f32..0511a0ca6d1cf 100644 --- a/pandas/tools/tests/test_ordered_merge.py +++ b/pandas/tools/tests/test_merge_ordered.py @@ -1,7 +1,7 @@ import nose import pandas as pd -from pandas import DataFrame, ordered_merge +from pandas import DataFrame, merge_ordered from pandas.util import testing as tm from pandas.util.testing import assert_frame_equal @@ -17,10 +17,15 @@ def setUp(self): 
self.right = DataFrame({'key': ['b', 'c', 'd', 'f'], 'rvalue': [1, 2, 3., 4]}) + def test_deprecation(self): + + with tm.assert_produces_warning(FutureWarning): + pd.ordered_merge(self.left, self.right, on='key') + # GH #813 def test_basic(self): - result = ordered_merge(self.left, self.right, on='key') + result = merge_ordered(self.left, self.right, on='key') expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'], 'lvalue': [1, nan, 2, nan, 3, nan], 'rvalue': [nan, 1, 2, 3, nan, 4]}) @@ -28,7 +33,7 @@ def test_basic(self): assert_frame_equal(result, expected) def test_ffill(self): - result = ordered_merge( + result = merge_ordered( self.left, self.right, on='key', fill_method='ffill') expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'], 'lvalue': [1., 1, 2, 2, 3, 3.], @@ -42,7 +47,7 @@ def test_multigroup(self): left['group'] = ['a'] * 3 + ['b'] * 3 # right['group'] = ['a'] * 4 + ['b'] * 4 - result = ordered_merge(left, self.right, on='key', left_by='group', + result = merge_ordered(left, self.right, on='key', left_by='group', fill_method='ffill') expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'] * 2, 'lvalue': [1., 1, 2, 2, 3, 3.] * 2, @@ -51,11 +56,11 @@ def test_multigroup(self): assert_frame_equal(result, expected.ix[:, result.columns]) - result2 = ordered_merge(self.right, left, on='key', right_by='group', + result2 = merge_ordered(self.right, left, on='key', right_by='group', fill_method='ffill') assert_frame_equal(result, result2.ix[:, result.columns]) - result = ordered_merge(left, self.right, on='key', left_by='group') + result = merge_ordered(left, self.right, on='key', left_by='group') self.assertTrue(result['group'].notnull().all()) def test_merge_type(self): diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py index 82feaae13f771..75c6db23b4bc7 100644 --- a/pandas/tools/tests/test_pivot.py +++ b/pandas/tools/tests/test_pivot.py @@ -779,6 +779,48 @@ def test_pivot_table_with_iterator_values(self): ) tm.assert_frame_equal(pivot_values_gen, pivot_values_list) + def test_pivot_table_margins_name_with_aggfunc_list(self): + # GH 13354 + margins_name = 'Weekly' + costs = pd.DataFrame( + {'item': ['bacon', 'cheese', 'bacon', 'cheese'], + 'cost': [2.5, 4.5, 3.2, 3.3], + 'day': ['M', 'M', 'T', 'T']} + ) + table = costs.pivot_table( + index="item", columns="day", margins=True, + margins_name=margins_name, aggfunc=[np.mean, max] + ) + ix = pd.Index( + ['bacon', 'cheese', margins_name], dtype='object', name='item' + ) + tups = [('mean', 'cost', 'M'), ('mean', 'cost', 'T'), + ('mean', 'cost', margins_name), ('max', 'cost', 'M'), + ('max', 'cost', 'T'), ('max', 'cost', margins_name)] + cols = pd.MultiIndex.from_tuples(tups, names=[None, None, 'day']) + expected = pd.DataFrame(table.values, index=ix, columns=cols) + tm.assert_frame_equal(table, expected) + + def test_categorical_margins(self): + # GH 10989 + df = pd.DataFrame({'x': np.arange(8), + 'y': np.arange(8) // 4, + 'z': np.arange(8) % 2}) + + expected = pd.DataFrame([[1.0, 2.0, 1.5], [5, 6, 5.5], [3, 4, 3.5]]) + expected.index = Index([0, 1, 'All'], name='y') + expected.columns = Index([0, 1, 'All'], name='z') + + data = df.copy() + table = data.pivot_table('x', 'y', 'z', margins=True) + tm.assert_frame_equal(table, expected) + + data = df.copy() + data.y = data.y.astype('category') + data.z = data.z.astype('category') + table = data.pivot_table('x', 'y', 'z', margins=True) + tm.assert_frame_equal(table, expected) + class TestCrosstab(tm.TestCase): @@ -853,7 +895,9 @@ def 
test_crosstab_margins(self): all_cols = result['All', ''] exp_cols = df.groupby(['a']).size().astype('i8') - exp_cols = exp_cols.append(Series([len(df)], index=['All'])) + # to keep index.name + exp_margin = Series([len(df)], index=Index(['All'], name='a')) + exp_cols = exp_cols.append(exp_margin) exp_cols.name = ('All', '') tm.assert_series_equal(all_cols, exp_cols) @@ -897,26 +941,6 @@ def test_crosstab_dropna(self): names=['b', 'c']) tm.assert_index_equal(res.columns, m) - def test_categorical_margins(self): - # GH 10989 - df = pd.DataFrame({'x': np.arange(8), - 'y': np.arange(8) // 4, - 'z': np.arange(8) % 2}) - - expected = pd.DataFrame([[1.0, 2.0, 1.5], [5, 6, 5.5], [3, 4, 3.5]]) - expected.index = Index([0, 1, 'All'], name='y') - expected.columns = Index([0, 1, 'All'], name='z') - - data = df.copy() - table = data.pivot_table('x', 'y', 'z', margins=True) - tm.assert_frame_equal(table, expected) - - data = df.copy() - data.y = data.y.astype('category') - data.z = data.z.astype('category') - table = data.pivot_table('x', 'y', 'z', margins=True) - tm.assert_frame_equal(table, expected) - def test_crosstab_no_overlap(self): # GS 10291 @@ -1062,7 +1086,6 @@ def test_crosstab_normalize(self): dtype='object'), columns=pd.Index([3, 4, 'All'], name='b')) - tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='index', margins=True), row_normal_margins) tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='columns', diff --git a/pandas/tools/tests/test_tile.py b/pandas/tools/tests/test_tile.py index 0b91fd1ef1c02..16731620a1dcd 100644 --- a/pandas/tools/tests/test_tile.py +++ b/pandas/tools/tests/test_tile.py @@ -19,8 +19,9 @@ class TestCut(tm.TestCase): def test_simple(self): data = np.ones(5) result = cut(data, 4, labels=False) - desired = np.array([1, 1, 1, 1, 1], dtype=np.int64) - tm.assert_numpy_array_equal(result, desired) + desired = np.array([1, 1, 1, 1, 1]) + tm.assert_numpy_array_equal(result, desired, + check_dtype=False) def test_bins(self): data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]) @@ -216,8 +217,7 @@ def test_label_formatting(self): def test_qcut_binning_issues(self): # #1978, 1979 - path = os.path.join(curpath(), 'cut_data.csv') - + path = os.path.join(tm.get_data_path(), 'cut_data.csv') arr = np.loadtxt(path) result = qcut(arr, 20) diff --git a/pandas/tools/tests/test_util.py b/pandas/tools/tests/test_util.py index c592b33bdab9a..7532997ef9d8e 100644 --- a/pandas/tools/tests/test_util.py +++ b/pandas/tools/tests/test_util.py @@ -120,7 +120,8 @@ def test_series_numeric(self): def test_error(self): s = pd.Series([1, -3.14, 'apple']) - with tm.assertRaises(ValueError): + msg = 'Unable to parse string "apple" at position 2' + with tm.assertRaisesRegexp(ValueError, msg): to_numeric(s, errors='raise') res = to_numeric(s, errors='ignore') @@ -131,9 +132,15 @@ def test_error(self): expected = pd.Series([1, -3.14, np.nan]) tm.assert_series_equal(res, expected) + s = pd.Series(['orange', 1, -3.14, 'apple']) + msg = 'Unable to parse string "orange" at position 0' + with tm.assertRaisesRegexp(ValueError, msg): + to_numeric(s, errors='raise') + def test_error_seen_bool(self): s = pd.Series([True, False, 'apple']) - with tm.assertRaises(ValueError): + msg = 'Unable to parse string "apple" at position 2' + with tm.assertRaisesRegexp(ValueError, msg): to_numeric(s, errors='raise') res = to_numeric(s, errors='ignore') @@ -291,6 +298,83 @@ def test_non_hashable(self): with self.assertRaisesRegexp(TypeError, "Invalid object type"): pd.to_numeric(s) + def test_downcast(self): + # see 
gh-13352 + mixed_data = ['1', 2, 3] + int_data = [1, 2, 3] + date_data = np.array(['1970-01-02', '1970-01-03', + '1970-01-04'], dtype='datetime64[D]') + + invalid_downcast = 'unsigned-integer' + msg = 'invalid downcasting method provided' + + smallest_int_dtype = np.dtype(np.typecodes['Integer'][0]) + smallest_uint_dtype = np.dtype(np.typecodes['UnsignedInteger'][0]) + + # support below np.float32 is rare and far between + float_32_char = np.dtype(np.float32).char + smallest_float_dtype = float_32_char + + for data in (mixed_data, int_data, date_data): + with self.assertRaisesRegexp(ValueError, msg): + pd.to_numeric(data, downcast=invalid_downcast) + + expected = np.array([1, 2, 3], dtype=np.int64) + + res = pd.to_numeric(data) + tm.assert_numpy_array_equal(res, expected) + + res = pd.to_numeric(data, downcast=None) + tm.assert_numpy_array_equal(res, expected) + + expected = np.array([1, 2, 3], dtype=smallest_int_dtype) + + for signed_downcast in ('integer', 'signed'): + res = pd.to_numeric(data, downcast=signed_downcast) + tm.assert_numpy_array_equal(res, expected) + + expected = np.array([1, 2, 3], dtype=smallest_uint_dtype) + res = pd.to_numeric(data, downcast='unsigned') + tm.assert_numpy_array_equal(res, expected) + + expected = np.array([1, 2, 3], dtype=smallest_float_dtype) + res = pd.to_numeric(data, downcast='float') + tm.assert_numpy_array_equal(res, expected) + + # if we can't successfully cast the given + # data to a numeric dtype, do not bother + # with the downcast parameter + data = ['foo', 2, 3] + expected = np.array(data, dtype=object) + res = pd.to_numeric(data, errors='ignore', + downcast='unsigned') + tm.assert_numpy_array_equal(res, expected) + + # cannot cast to an unsigned integer because + # we have a negative number + data = ['-1', 2, 3] + expected = np.array([-1, 2, 3], dtype=np.int64) + res = pd.to_numeric(data, downcast='unsigned') + tm.assert_numpy_array_equal(res, expected) + + # cannot cast to an integer (signed or unsigned) + # because we have a float number + data = ['1.1', 2, 3] + expected = np.array([1.1, 2, 3], dtype=np.float64) + + for downcast in ('integer', 'signed', 'unsigned'): + res = pd.to_numeric(data, downcast=downcast) + tm.assert_numpy_array_equal(res, expected) + + # the smallest integer dtype need not be np.(u)int8 + data = ['256', 257, 258] + + for downcast, expected_dtype in zip( + ['integer', 'signed', 'unsigned'], + [np.int16, np.int16, np.uint16]): + expected = np.array([256, 257, 258], dtype=expected_dtype) + res = pd.to_numeric(data, downcast=downcast) + tm.assert_numpy_array_equal(res, expected) if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index b0bbf8ba70354..62bbfc2f630a5 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -2,12 +2,14 @@ Quantilization functions and related stuff """ +from pandas.types.missing import isnull +from pandas.types.common import (is_float, is_integer, + is_scalar) + from pandas.core.api import Series from pandas.core.categorical import Categorical import pandas.core.algorithms as algos -import pandas.core.common as com import pandas.core.nanops as nanops -import pandas.lib as lib from pandas.compat import zip import numpy as np @@ -80,7 +82,7 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, """ # NOTE: this binning code is changed a bit from histogram for var(x) == 0 if not np.iterable(bins): - if lib.isscalar(bins) and bins < 1: + if is_scalar(bins) and bins 
< 1: raise ValueError("`bins` should be a positive integer.") try: # for array-like sz = x.size @@ -164,7 +166,7 @@ def qcut(x, q, labels=None, retbins=False, precision=3): >>> pd.qcut(range(5), 4, labels=False) array([0, 0, 1, 2, 3], dtype=int64) """ - if com.is_integer(q): + if is_integer(q): quantiles = np.linspace(0, 1, q + 1) else: quantiles = q @@ -194,7 +196,7 @@ def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False, if include_lowest: ids[x == bins[0]] = 1 - na_mask = com.isnull(x) | (ids == len(bins)) | (ids == 0) + na_mask = isnull(x) | (ids == len(bins)) | (ids == 0) has_nas = na_mask.any() if labels is not False: @@ -264,7 +266,7 @@ def _format_label(x, precision=3): fmt_str = '%%.%dg' % precision if np.isinf(x): return str(x) - elif com.is_float(x): + elif is_float(x): frac, whole = np.modf(x) sgn = '-' if x < 0 else '' whole = abs(whole) diff --git a/pandas/tools/util.py b/pandas/tools/util.py index 61d2c0adce2fe..b8b28663387cc 100644 --- a/pandas/tools/util.py +++ b/pandas/tools/util.py @@ -1,6 +1,12 @@ import numpy as np import pandas.lib as lib +from pandas.types.common import (is_number, + is_numeric_dtype, + is_datetime_or_timedelta_dtype, + _ensure_object) +from pandas.types.cast import _possibly_downcast_to_dtype + import pandas as pd from pandas.compat import reduce from pandas.core.index import Index @@ -50,7 +56,7 @@ def compose(*funcs): return reduce(_compose2, funcs) -def to_numeric(arg, errors='raise'): +def to_numeric(arg, errors='raise', downcast=None): """ Convert argument to a numeric type. @@ -61,6 +67,27 @@ def to_numeric(arg, errors='raise'): - If 'raise', then invalid parsing will raise an exception - If 'coerce', then invalid parsing will be set as NaN - If 'ignore', then invalid parsing will return the input + downcast : {'integer', 'signed', 'unsigned', 'float'} , default None + If not None, and if the data has been successfully cast to a + numerical dtype (or if the data was numeric to begin with), + downcast that resulting data to the smallest numerical dtype + possible according to the following rules: + + - 'integer' or 'signed': smallest signed int dtype (min.: np.int8) + - 'unsigned': smallest unsigned int dtype (min.: np.uint8) + - 'float': smallest float dtype (min.: np.float32) + + As this behaviour is separate from the core conversion to + numeric values, any errors raised during the downcasting + will be surfaced regardless of the value of the 'errors' input. + + In addition, downcasting will only occur if the size + of the resulting data's dtype is strictly larger than + the dtype it is to be cast to, so if none of the dtypes + checked satisfy that specification, no downcasting will be + performed on the data. + + .. 
versionadded:: 0.19.0 Returns ------- @@ -74,10 +101,37 @@ def to_numeric(arg, errors='raise'): >>> import pandas as pd >>> s = pd.Series(['1.0', '2', -3]) >>> pd.to_numeric(s) + 0 1.0 + 1 2.0 + 2 -3.0 + dtype: float64 + >>> pd.to_numeric(s, downcast='float') + 0 1.0 + 1 2.0 + 2 -3.0 + dtype: float32 + >>> pd.to_numeric(s, downcast='signed') + 0 1 + 1 2 + 2 -3 + dtype: int8 >>> s = pd.Series(['apple', '1.0', '2', -3]) >>> pd.to_numeric(s, errors='ignore') + 0 apple + 1 1.0 + 2 2 + 3 -3 + dtype: object >>> pd.to_numeric(s, errors='coerce') + 0 NaN + 1 1.0 + 2 2.0 + 3 -3.0 + dtype: float64 """ + if downcast not in (None, 'integer', 'signed', 'unsigned', 'float'): + raise ValueError('invalid downcasting method provided') + is_series = False is_index = False is_scalar = False @@ -93,7 +147,7 @@ def to_numeric(arg, errors='raise'): elif isinstance(arg, (list, tuple)): values = np.array(arg, dtype='O') elif np.isscalar(arg): - if com.is_number(arg): + if is_number(arg): return arg is_scalar = True values = np.array([arg], dtype='O') @@ -102,20 +156,50 @@ def to_numeric(arg, errors='raise'): else: values = arg - if com.is_numeric_dtype(values): - pass - elif com.is_datetime_or_timedelta_dtype(values): - values = values.astype(np.int64) - else: - values = com._ensure_object(values) - coerce_numeric = False if errors in ('ignore', 'raise') else True - - try: + try: + if is_numeric_dtype(values): + pass + elif is_datetime_or_timedelta_dtype(values): + values = values.astype(np.int64) + else: + values = _ensure_object(values) + coerce_numeric = False if errors in ('ignore', 'raise') else True values = lib.maybe_convert_numeric(values, set(), coerce_numeric=coerce_numeric) - except: - if errors == 'raise': - raise + + except Exception: + if errors == 'raise': + raise + + # attempt downcast only if the data has been successfully converted + # to a numerical dtype and if a downcast method has been specified + if downcast is not None and is_numeric_dtype(values): + typecodes = None + + if downcast in ('integer', 'signed'): + typecodes = np.typecodes['Integer'] + elif downcast == 'unsigned' and np.min(values) > 0: + typecodes = np.typecodes['UnsignedInteger'] + elif downcast == 'float': + typecodes = np.typecodes['Float'] + + # pandas support goes only to np.float32, + # as float dtypes smaller than that are + # extremely rare and not well supported + float_32_char = np.dtype(np.float32).char + float_32_ind = typecodes.index(float_32_char) + typecodes = typecodes[float_32_ind:] + + if typecodes is not None: + # from smallest to largest + for dtype in typecodes: + if np.dtype(dtype).itemsize < values.dtype.itemsize: + values = _possibly_downcast_to_dtype( + values, dtype) + + # successful conversion + if values.dtype == dtype: + break if is_series: return pd.Series(values, index=arg.index, name=arg.name) diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py index 42631d442a990..3b676b894d355 100644 --- a/pandas/tseries/base.py +++ b/pandas/tseries/base.py @@ -2,17 +2,22 @@ Base and utility classes for tseries type pandas objects. 
""" -import warnings from datetime import datetime, timedelta from pandas import compat from pandas.compat.numpy import function as nv import numpy as np - +from pandas.types.common import (is_integer, is_float, + is_bool_dtype, _ensure_int64, + is_scalar, is_dtype_equal, + is_list_like) +from pandas.types.generic import (ABCIndex, ABCSeries, + ABCPeriodIndex, ABCIndexClass) +from pandas.types.missing import isnull from pandas.core import common as com, algorithms -from pandas.core.common import (is_integer, is_float, is_bool_dtype, - AbstractMethodError) +from pandas.core.common import AbstractMethodError + import pandas.formats.printing as printing import pandas.tslib as tslib import pandas._period as prlib @@ -20,6 +25,7 @@ from pandas.core.index import Index from pandas.indexes.base import _index_shared_docs from pandas.util.decorators import Appender, cache_readonly +import pandas.types.concat as _concat import pandas.tseries.frequencies as frequencies import pandas.algos as _algos @@ -102,6 +108,34 @@ def ceil(self, freq): class DatetimeIndexOpsMixin(object): """ common ops mixin to support a unified inteface datetimelike Index """ + def equals(self, other): + """ + Determines if two Index objects contain the same elements. + """ + if self.is_(other): + return True + + if not isinstance(other, ABCIndexClass): + return False + elif not isinstance(other, type(self)): + try: + other = type(self)(other) + except: + return False + + if not is_dtype_equal(self.dtype, other.dtype): + # have different timezone + return False + + # ToDo: Remove this when PeriodDtype is added + elif isinstance(self, ABCPeriodIndex): + if not isinstance(other, ABCPeriodIndex): + return False + if self.freq != other.freq: + return False + + return np.array_equal(self.asi8, other.asi8) + def __iter__(self): return (self._box_func(v) for v in self.asi8) @@ -111,9 +145,9 @@ def _join_i8_wrapper(joinf, dtype, with_indexers=True): @staticmethod def wrapper(left, right): - if isinstance(left, (np.ndarray, com.ABCIndex, com.ABCSeries)): + if isinstance(left, (np.ndarray, ABCIndex, ABCSeries)): left = left.view('i8') - if isinstance(right, (np.ndarray, com.ABCIndex, com.ABCSeries)): + if isinstance(right, (np.ndarray, ABCIndex, ABCSeries)): right = right.view('i8') results = joinf(left, right) if with_indexers: @@ -133,16 +167,16 @@ def _evaluate_compare(self, other, op): # coerce to a similar object if not isinstance(other, type(self)): - if not com.is_list_like(other): + if not is_list_like(other): # scalar other = [other] - elif lib.isscalar(lib.item_from_zerodim(other)): + elif is_scalar(lib.item_from_zerodim(other)): # ndarray scalar other = [other.item()] other = type(self)(other) # compare - result = getattr(self.asi8, op)(other.asi8) + result = op(self.asi8, other.asi8) # technically we could support bool dtyped Index # for now just return the indexing array directly @@ -174,7 +208,7 @@ def _ensure_localized(self, result): # reconvert to local tz if getattr(self, 'tz', None) is not None: - if not isinstance(result, com.ABCIndexClass): + if not isinstance(result, ABCIndexClass): result = self._simple_new(result) result = result.tz_localize(self.tz) return result @@ -202,7 +236,7 @@ def _format_with_header(self, header, **kwargs): def __contains__(self, key): try: res = self.get_loc(key) - return lib.isscalar(res) or type(res) == slice or np.any(res) + return is_scalar(res) or type(res) == slice or np.any(res) except (KeyError, TypeError, ValueError): return False @@ -213,7 +247,7 @@ def __getitem__(self, key): 
""" is_int = is_integer(key) - if lib.isscalar(key) and not is_int: + if is_scalar(key) and not is_int: raise ValueError getitem = self._data.__getitem__ @@ -230,16 +264,25 @@ def __getitem__(self, key): attribs = self._get_attributes_dict() - freq = None - if isinstance(key, slice): - if self.freq is not None and key.step is not None: - freq = key.step * self.freq - else: - freq = self.freq + is_period = isinstance(self, ABCPeriodIndex) + if is_period: + freq = self.freq + else: + freq = None + if isinstance(key, slice): + if self.freq is not None and key.step is not None: + freq = key.step * self.freq + else: + freq = self.freq + attribs['freq'] = freq result = getitem(key) if result.ndim > 1: + # To support MPL which performs slicing with 2 dim + # even though it only has 1 dim by definition + if is_period: + return self._simple_new(result, **attribs) return result return self._simple_new(result, **attribs) @@ -282,7 +325,7 @@ def _nat_new(self, box=True): return result attribs = self._get_attributes_dict() - if not isinstance(self, com.ABCPeriodIndex): + if not isinstance(self, ABCPeriodIndex): attribs['freq'] = None return self._simple_new(result, **attribs) @@ -308,11 +351,11 @@ def sort_values(self, return_indexer=False, ascending=True): sorted_index = self.take(_as) return sorted_index, _as else: - sorted_values = np.sort(self.values) + sorted_values = np.sort(self._values) attribs = self._get_attributes_dict() freq = attribs['freq'] - if freq is not None and not isinstance(self, com.ABCPeriodIndex): + if freq is not None and not isinstance(self, ABCPeriodIndex): if freq.n > 0 and not ascending: freq = freq * -1 elif freq.n < 0 and ascending: @@ -328,7 +371,7 @@ def sort_values(self, return_indexer=False, ascending=True): def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): nv.validate_take(tuple(), kwargs) - indices = com._ensure_int64(indices) + indices = _ensure_int64(indices) maybe_slice = lib.maybe_indices_to_slice(indices, len(self)) if isinstance(maybe_slice, slice): @@ -340,23 +383,23 @@ def take(self, indices, axis=0, allow_fill=True, na_value=tslib.iNaT) # keep freq in PeriodIndex, reset otherwise - freq = self.freq if isinstance(self, com.ABCPeriodIndex) else None + freq = self.freq if isinstance(self, ABCPeriodIndex) else None return self._shallow_copy(taken, freq=freq) def get_duplicates(self): values = Index.get_duplicates(self) return self._simple_new(values) + _can_hold_na = True + + _na_value = tslib.NaT + """The expected NA value to use with this index.""" + @cache_readonly def _isnan(self): """ return if each value is nan""" return (self.asi8 == tslib.iNaT) - @cache_readonly - def hasnans(self): - """ return if I have any nans; enables various perf speedups """ - return self._isnan.any() - @property def asobject(self): """ @@ -545,7 +588,7 @@ def _convert_scalar_indexer(self, key, kind=None): # we don't allow integer/float indexing for loc # we don't allow float indexing for ix/getitem - if lib.isscalar(key): + if is_scalar(key): is_int = is_integer(key) is_flt = is_float(key) if kind in ['loc'] and (is_int or is_flt): @@ -584,14 +627,13 @@ def __add__(self, other): raise TypeError("cannot add TimedeltaIndex and {typ}" .format(typ=type(other))) elif isinstance(other, Index): - warnings.warn("using '+' to provide set union with " - "datetimelike Indexes is deprecated, " - "use .union()", FutureWarning, stacklevel=2) - return self.union(other) + raise TypeError("cannot add {typ1} and {typ2}" + .format(typ1=type(self).__name__, + 
typ2=type(other).__name__)) elif isinstance(other, (DateOffset, timedelta, np.timedelta64, tslib.Timedelta)): return self._add_delta(other) - elif com.is_integer(other): + elif is_integer(other): return self.shift(other) elif isinstance(other, (tslib.Timestamp, datetime)): return self._add_datelike(other) @@ -602,6 +644,7 @@ def __add__(self, other): def __sub__(self, other): from pandas.core.index import Index + from pandas.tseries.index import DatetimeIndex from pandas.tseries.tdi import TimedeltaIndex from pandas.tseries.offsets import DateOffset if isinstance(other, TimedeltaIndex): @@ -609,17 +652,18 @@ def __sub__(self, other): elif isinstance(self, TimedeltaIndex) and isinstance(other, Index): if not isinstance(other, TimedeltaIndex): raise TypeError("cannot subtract TimedeltaIndex and {typ}" - .format(typ=type(other))) + .format(typ=type(other).__name__)) return self._add_delta(-other) + elif isinstance(other, DatetimeIndex): + return self._sub_datelike(other) elif isinstance(other, Index): - warnings.warn("using '-' to provide set differences with " - "datetimelike Indexes is deprecated, " - "use .difference()", FutureWarning, stacklevel=2) - return self.difference(other) + raise TypeError("cannot subtract {typ1} and {typ2}" + .format(typ1=type(self).__name__, + typ2=type(other).__name__)) elif isinstance(other, (DateOffset, timedelta, np.timedelta64, tslib.Timedelta)): return self._add_delta(-other) - elif com.is_integer(other): + elif is_integer(other): return self.shift(-other) elif isinstance(other, (tslib.Timestamp, datetime)): return self._sub_datelike(other) @@ -725,29 +769,21 @@ def shift(self, n, freq=None): attribs['end'] = end return type(self)(**attribs) - def unique(self): - """ - Index.unique with handling for DatetimeIndex/PeriodIndex metadata - - Returns - ------- - result : DatetimeIndex or PeriodIndex - """ - from pandas.core.index import Int64Index - result = Int64Index.unique(self) - return self._simple_new(result, name=self.name, freq=self.freq, - tz=getattr(self, 'tz', None)) - def repeat(self, repeats, *args, **kwargs): """ Analogous to ndarray.repeat """ nv.validate_repeat(args, kwargs) - return self._shallow_copy(self.values.repeat(repeats), freq=None) + if isinstance(self, ABCPeriodIndex): + freq = self.freq + else: + freq = None + return self._shallow_copy(self.asi8.repeat(repeats), + freq=freq) def where(self, cond, other=None): """ - .. versionadded:: 0.18.2 + .. 
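# --- Illustrative aside, not part of the patch ------------------------------
# The new __add__/__sub__ branches above replace the deprecated set-operation
# behaviour: '+' between two datetimelike indexes now raises, while '-'
# between two DatetimeIndex objects yields element-wise timedeltas.
import pandas as pd

a = pd.date_range('2016-01-02', periods=3, freq='D')
b = pd.date_range('2016-01-01', periods=3, freq='D')

print(a - b)        # TimedeltaIndex(['1 days', '1 days', '1 days'])
try:
    a + b           # no longer a union; raises TypeError
except TypeError as exc:
    print(exc)
# -----------------------------------------------------------------------------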
versionadded:: 0.19.0 Return an Index of same shape as self and whose corresponding entries are from self where cond is True and otherwise are from @@ -788,18 +824,38 @@ def summary(self, name=None): result = result.replace("'", "") return result + def _append_same_dtype(self, to_concat, name): + """ + Concatenate to_concat which has the same class + """ + attribs = self._get_attributes_dict() + attribs['name'] = name + + if not isinstance(self, ABCPeriodIndex): + # reset freq + attribs['freq'] = None + + if getattr(self, 'tz', None) is not None: + return _concat._concat_datetimetz(to_concat, name) + else: + new_data = np.concatenate([c.asi8 for c in to_concat]) + return self._simple_new(new_data, **attribs) + def _ensure_datetimelike_to_i8(other): """ helper for coercing an input scalar or array to i8 """ - if lib.isscalar(other) and com.isnull(other): + if lib.isscalar(other) and isnull(other): other = tslib.iNaT - elif isinstance(other, com.ABCIndexClass): - + elif isinstance(other, ABCIndexClass): # convert tz if needed if getattr(other, 'tz', None) is not None: other = other.tz_localize(None).asi8 else: other = other.asi8 else: - other = np.array(other, copy=False).view('i8') + try: + other = np.array(other, copy=False).view('i8') + except TypeError: + # period array cannot be coerces to int + other = Index(other).asi8 return other diff --git a/pandas/tseries/common.py b/pandas/tseries/common.py index 8937e83c7009a..46e8bd43e8ff8 100644 --- a/pandas/tseries/common.py +++ b/pandas/tseries/common.py @@ -3,19 +3,21 @@ """ import numpy as np + +from pandas.types.common import (_NS_DTYPE, _TD_DTYPE, + is_period_arraylike, + is_datetime_arraylike, is_integer_dtype, + is_datetime64_dtype, is_datetime64tz_dtype, + is_timedelta64_dtype, is_categorical_dtype, + is_list_like) + from pandas.core.base import PandasDelegate, NoNewAttributesMixin -from pandas.core import common as com from pandas.tseries.index import DatetimeIndex from pandas._period import IncompatibleFrequency # flake8: noqa from pandas.tseries.period import PeriodIndex from pandas.tseries.tdi import TimedeltaIndex from pandas import tslib from pandas.core.algorithms import take_1d -from pandas.core.common import (_NS_DTYPE, _TD_DTYPE, is_period_arraylike, - is_datetime_arraylike, is_integer_dtype, - is_list_like, - is_datetime64_dtype, is_datetime64tz_dtype, - is_timedelta64_dtype, is_categorical_dtype) def is_datetimelike(data): @@ -129,7 +131,7 @@ def _delegate_method(self, name, *args, **kwargs): method = getattr(self.values, name) result = method(*args, **kwargs) - if not com.is_list_like(result): + if not is_list_like(result): return result result = Series(result, index=self.index, name=self.name) diff --git a/pandas/tseries/converter.py b/pandas/tseries/converter.py index 78b185ae8cf31..8f8519a498a31 100644 --- a/pandas/tseries/converter.py +++ b/pandas/tseries/converter.py @@ -10,6 +10,14 @@ from matplotlib.ticker import Formatter, AutoLocator, Locator from matplotlib.transforms import nonsingular + +from pandas.types.common import (is_float, is_integer, + is_integer_dtype, + is_float_dtype, + is_datetime64_ns_dtype, + is_period_arraylike, + ) + from pandas.compat import lrange import pandas.compat as compat import pandas.lib as lib @@ -73,8 +81,8 @@ class TimeConverter(units.ConversionInterface): @staticmethod def convert(value, unit, axis): valid_types = (str, pydt.time) - if (isinstance(value, valid_types) or com.is_integer(value) or - com.is_float(value)): + if (isinstance(value, valid_types) or is_integer(value) or + 
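# --- Illustrative aside, not part of the patch ------------------------------
# Rough sketch of the _append_same_dtype helper defined a few hunks above,
# assuming Index.append routes same-dtype datetimelike concatenation through
# it: tz-aware data goes via _concat_datetimetz and freq is reset for
# non-period indexes. Data below is hypothetical.
import pandas as pd

x = pd.date_range('2016-01-01', periods=2, freq='D', tz='US/Eastern')
y = pd.date_range('2016-01-03', periods=2, freq='D', tz='US/Eastern')

z = x.append(y)
print(z.tz)     # US/Eastern preserved through the tz-aware concat path
print(z.freq)   # None: freq is reset because this is not a PeriodIndex
# -----------------------------------------------------------------------------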
is_float(value)): return time2num(value) if isinstance(value, Index): return value.map(time2num) @@ -129,15 +137,15 @@ def convert(values, units, axis): raise TypeError('Axis must have `freq` set to convert to Periods') valid_types = (compat.string_types, datetime, Period, pydt.date, pydt.time) - if (isinstance(values, valid_types) or com.is_integer(values) or - com.is_float(values)): + if (isinstance(values, valid_types) or is_integer(values) or + is_float(values)): return get_datevalue(values, axis.freq) if isinstance(values, PeriodIndex): - return values.asfreq(axis.freq).values + return values.asfreq(axis.freq)._values if isinstance(values, Index): return values.map(lambda x: get_datevalue(x, axis.freq)) - if com.is_period_arraylike(values): - return PeriodIndex(values, freq=axis.freq).values + if is_period_arraylike(values): + return PeriodIndex(values, freq=axis.freq)._values if isinstance(values, (list, tuple, np.ndarray, Index)): return [get_datevalue(x, axis.freq) for x in values] return values @@ -149,7 +157,7 @@ def get_datevalue(date, freq): elif isinstance(date, (compat.string_types, datetime, pydt.date, pydt.time)): return Period(date, freq).ordinal - elif (com.is_integer(date) or com.is_float(date) or + elif (is_integer(date) or is_float(date) or (isinstance(date, (np.ndarray, Index)) and (date.size == 1))): return date elif date is None: @@ -163,8 +171,8 @@ def _dt_to_float_ordinal(dt): preserving hours, minutes, seconds and microseconds. Return value is a :func:`float`. """ - if (isinstance(dt, (np.ndarray, Index, Series)) and - com.is_datetime64_ns_dtype(dt)): + if (isinstance(dt, (np.ndarray, Index, Series) + ) and is_datetime64_ns_dtype(dt)): base = dates.epoch2num(dt.asi8 / 1.0E9) else: base = dates.date2num(dt) @@ -188,7 +196,7 @@ def try_parse(values): return _dt_to_float_ordinal(lib.Timestamp(values)) elif isinstance(values, pydt.time): return dates.date2num(values) - elif (com.is_integer(values) or com.is_float(values)): + elif (is_integer(values) or is_float(values)): return values elif isinstance(values, compat.string_types): return try_parse(values) @@ -198,7 +206,7 @@ def try_parse(values): if not isinstance(values, np.ndarray): values = com._asarray_tuplesafe(values) - if com.is_integer_dtype(values) or com.is_float_dtype(values): + if is_integer_dtype(values) or is_float_dtype(values): return values try: @@ -208,7 +216,7 @@ def try_parse(values): else: values = [_dt_to_float_ordinal(x) for x in values] except Exception: - pass + values = _dt_to_float_ordinal(values) return values @@ -510,7 +518,7 @@ def _daily_finder(vmin, vmax, freq): info = np.zeros(span, dtype=[('val', np.int64), ('maj', bool), ('min', bool), ('fmt', '|S20')]) - info['val'][:] = dates_.values + info['val'][:] = dates_._values info['fmt'][:] = '' info['maj'][[0, -1]] = True # .. 
and set some shortcuts diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 3f1d0c6d969a6..ac094c1f545f3 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -1,17 +1,22 @@ from datetime import timedelta -from pandas.compat import range, long, zip +from pandas.compat import long, zip from pandas import compat import re import warnings import numpy as np +from pandas.types.generic import ABCSeries +from pandas.types.common import (is_integer, + is_period_arraylike, + is_timedelta64_dtype, + is_datetime64_dtype) + import pandas.core.algorithms as algos from pandas.core.algorithms import unique from pandas.tseries.offsets import DateOffset -from pandas.util.decorators import cache_readonly +from pandas.util.decorators import cache_readonly, deprecate_kwarg import pandas.tseries.offsets as offsets -import pandas.core.common as com import pandas.lib as lib import pandas.tslib as tslib from pandas.tslib import Timedelta @@ -255,8 +260,8 @@ def get_freq_code(freqstr): freqstr = (freqstr.rule_code, freqstr.n) if isinstance(freqstr, tuple): - if (com.is_integer(freqstr[0]) and - com.is_integer(freqstr[1])): + if (is_integer(freqstr[0]) and + is_integer(freqstr[1])): # e.g., freqstr = (2000, 1) return freqstr else: @@ -265,13 +270,13 @@ def get_freq_code(freqstr): code = _period_str_to_code(freqstr[0]) stride = freqstr[1] except: - if com.is_integer(freqstr[1]): + if is_integer(freqstr[1]): raise code = _period_str_to_code(freqstr[1]) stride = freqstr[0] return code, stride - if com.is_integer(freqstr): + if is_integer(freqstr): return (freqstr, 1) base, stride = _base_and_stride(freqstr) @@ -351,34 +356,6 @@ def get_period_alias(offset_str): """ alias to closest period strings BQ->Q etc""" return _offset_to_period_map.get(offset_str, None) -_rule_aliases = { - # Legacy rules that will continue to map to their original values - # essentially for the rest of time - 'WEEKDAY': 'B', - 'EOM': 'BM', - 'W@MON': 'W-MON', - 'W@TUE': 'W-TUE', - 'W@WED': 'W-WED', - 'W@THU': 'W-THU', - 'W@FRI': 'W-FRI', - 'W@SAT': 'W-SAT', - 'W@SUN': 'W-SUN', - 'Q@JAN': 'BQ-JAN', - 'Q@FEB': 'BQ-FEB', - 'Q@MAR': 'BQ-MAR', - 'A@JAN': 'BA-JAN', - 'A@FEB': 'BA-FEB', - 'A@MAR': 'BA-MAR', - 'A@APR': 'BA-APR', - 'A@MAY': 'BA-MAY', - 'A@JUN': 'BA-JUN', - 'A@JUL': 'BA-JUL', - 'A@AUG': 'BA-AUG', - 'A@SEP': 'BA-SEP', - 'A@OCT': 'BA-OCT', - 'A@NOV': 'BA-NOV', - 'A@DEC': 'BA-DEC', -} _lite_rule_alias = { 'W': 'W-SUN', @@ -396,17 +373,6 @@ def get_period_alias(offset_str): 'ns': 'N' } -# TODO: Can this be killed? -for _i, _weekday in enumerate(['MON', 'TUE', 'WED', 'THU', 'FRI']): - for _iweek in range(4): - _name = 'WOM-%d%s' % (_iweek + 1, _weekday) - _rule_aliases[_name.replace('-', '@')] = _name - -# Note that _rule_aliases is not 1:1 (d[BA]==d[A@DEC]), and so traversal -# order matters when constructing an inverse. we pick one. 
#2331 -# Used in get_legacy_offset_name -_legacy_reverse_map = dict((v, k) for k, v in - reversed(sorted(compat.iteritems(_rule_aliases)))) _name_to_offset_map = {'days': Day(1), 'hours': Hour(1), @@ -417,37 +383,74 @@ def get_period_alias(offset_str): 'nanoseconds': Nano(1)} -def to_offset(freqstr): +_INVALID_FREQ_ERROR = "Invalid frequency: {0}" + + +@deprecate_kwarg(old_arg_name='freqstr', new_arg_name='freq') +def to_offset(freq): """ - Return DateOffset object from string representation or - Timedelta object + Return DateOffset object from string or tuple representation + or datetime.timedelta object + + Parameters + ---------- + freq : str, tuple, datetime.timedelta, DateOffset or None + + Returns + ------- + delta : DateOffset + None if freq is None + + Raises + ------ + ValueError + If freq is an invalid frequency + + See Also + -------- + pandas.DateOffset Examples -------- - >>> to_offset('5Min') - Minute(5) + >>> to_offset('5min') + <5 * Minutes> + + >>> to_offset('1D1H') + <25 * Hours> + + >>> to_offset(('W', 2)) + <2 * Weeks: weekday=6> + + >>> to_offset((2, 'B')) + <2 * BusinessDays> + + >>> to_offset(datetime.timedelta(days=1)) + + + >>> to_offset(Hour()) + """ - if freqstr is None: + if freq is None: return None - if isinstance(freqstr, DateOffset): - return freqstr + if isinstance(freq, DateOffset): + return freq - if isinstance(freqstr, tuple): - name = freqstr[0] - stride = freqstr[1] + if isinstance(freq, tuple): + name = freq[0] + stride = freq[1] if isinstance(stride, compat.string_types): name, stride = stride, name name, _ = _base_and_stride(name) delta = get_offset(name) * stride - elif isinstance(freqstr, timedelta): + elif isinstance(freq, timedelta): delta = None - freqstr = Timedelta(freqstr) + freq = Timedelta(freq) try: - for name in freqstr.components._fields: + for name in freq.components._fields: offset = _name_to_offset_map[name] - stride = getattr(freqstr.components, name) + stride = getattr(freq.components, name) if stride != 0: offset = stride * offset if delta is None: @@ -455,13 +458,20 @@ def to_offset(freqstr): else: delta = delta + offset except Exception: - raise ValueError("Could not evaluate %s" % freqstr) + raise ValueError(_INVALID_FREQ_ERROR.format(freq)) else: delta = None stride_sign = None try: - for stride, name, _ in opattern.findall(freqstr): + splitted = re.split(opattern, freq) + if splitted[-1] != '' and not splitted[-1].isspace(): + # the last element must be blank + raise ValueError('last element must be blank') + for sep, stride, name in zip(splitted[0::4], splitted[1::4], + splitted[2::4]): + if sep != '' and not sep.isspace(): + raise ValueError('separator must be spaces') offset = get_offset(name) if stride_sign is None: stride_sign = -1 if stride.startswith('-') else 1 @@ -474,16 +484,16 @@ def to_offset(freqstr): else: delta = delta + offset except Exception: - raise ValueError("Could not evaluate %s" % freqstr) + raise ValueError(_INVALID_FREQ_ERROR.format(freq)) if delta is None: - raise ValueError('Unable to understand %s as a frequency' % freqstr) + raise ValueError(_INVALID_FREQ_ERROR.format(freq)) return delta # hack to handle WOM-1MON -opattern = re.compile(r'([\-]?\d*)\s*([A-Za-z]+([\-@][\dA-Za-z\-]+)?)') +opattern = re.compile(r'([\-]?\d*)\s*([A-Za-z]+([\-][\dA-Za-z\-]+)?)') def _base_and_stride(freqstr): @@ -521,9 +531,6 @@ def get_base_alias(freqstr): _dont_uppercase = set(('MS', 'ms')) -_LEGACY_FREQ_WARNING = 'Freq "{0}" is deprecated, use "{1}" as alternative.' 
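# --- Illustrative aside, not part of the patch ------------------------------
# The rewritten to_offset above accepts strings, tuples and timedeltas, and
# every unparseable input now surfaces the single "Invalid frequency" message.
# Values below follow the examples in the new docstring.
from pandas.tseries.frequencies import to_offset

print(to_offset('1D1H'))    # <25 * Hours>: compound strings are summed
print(to_offset(('W', 2)))  # <2 * Weeks: weekday=6>

try:
    to_offset('foo')
except ValueError as exc:
    print(exc)              # Invalid frequency: foo
# -----------------------------------------------------------------------------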
- - def get_offset(name): """ Return DateOffset object associated with rule name @@ -534,27 +541,9 @@ def get_offset(name): """ if name not in _dont_uppercase: name = name.upper() - - if name in _rule_aliases: - new = _rule_aliases[name] - warnings.warn(_LEGACY_FREQ_WARNING.format(name, new), - FutureWarning, stacklevel=2) - name = new - elif name.lower() in _rule_aliases: - new = _rule_aliases[name.lower()] - warnings.warn(_LEGACY_FREQ_WARNING.format(name, new), - FutureWarning, stacklevel=2) - name = new - name = _lite_rule_alias.get(name, name) name = _lite_rule_alias.get(name.lower(), name) - else: - if name in _rule_aliases: - new = _rule_aliases[name] - warnings.warn(_LEGACY_FREQ_WARNING.format(name, new), - FutureWarning, stacklevel=2) - name = new name = _lite_rule_alias.get(name, name) if name not in _offset_map: @@ -566,7 +555,7 @@ def get_offset(name): offset = klass._from_name(*split[1:]) except (ValueError, TypeError, KeyError): # bad prefix or suffix - raise ValueError('Bad rule name requested: %s.' % name) + raise ValueError(_INVALID_FREQ_ERROR.format(name)) # cache _offset_map[name] = offset # do not return cache because it's mutable @@ -590,29 +579,15 @@ def get_offset_name(offset): return offset.freqstr -def get_legacy_offset_name(offset): - """ - Return the pre pandas 0.8.0 name for the date offset - """ - - # This only used in test_timeseries_legacy.py - - name = offset.name - return _legacy_reverse_map.get(name, name) - - def get_standard_freq(freq): """ Return the standardized frequency string """ - if freq is None: - return None - - if isinstance(freq, DateOffset): - return freq.rule_code - code, stride = get_freq_code(freq) - return _get_freq_str(code, stride) + msg = ("get_standard_freq is deprecated. Use to_offset(freq).rule_code " + "instead.") + warnings.warn(msg, FutureWarning, stacklevel=2) + return to_offset(freq).rule_code # --------------------------------------------------------------------- # Period codes @@ -683,144 +658,19 @@ def get_standard_freq(freq): }) -def _period_alias_dictionary(): - """ - Build freq alias dictionary to support freqs from original c_dates.c file - of the scikits.timeseries library. 
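# --- Illustrative aside, not part of the patch ------------------------------
# get_standard_freq is deprecated above in favour of the spelling from its
# warning message; a small sketch of that replacement (results assumed):
from pandas.tseries.frequencies import to_offset

print(to_offset('5min').rule_code)   # 'T'
print(to_offset('Q-DEC').rule_code)  # 'Q-DEC'
# Legacy aliases such as 'W@MON' are removed from get_offset and now raise
# ValueError("Invalid frequency: ...") instead of emitting a FutureWarning.
# -----------------------------------------------------------------------------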
- """ - alias_dict = {} - - M_aliases = ["M", "MTH", "MONTH", "MONTHLY"] - B_aliases = ["B", "BUS", "BUSINESS", "BUSINESSLY", "WEEKDAY"] - D_aliases = ["D", "DAY", "DLY", "DAILY"] - H_aliases = ["H", "HR", "HOUR", "HRLY", "HOURLY"] - T_aliases = ["T", "MIN", "MINUTE", "MINUTELY"] - S_aliases = ["S", "SEC", "SECOND", "SECONDLY"] - L_aliases = ["L", "ms", "MILLISECOND", "MILLISECONDLY"] - U_aliases = ["U", "US", "MICROSECOND", "MICROSECONDLY"] - N_aliases = ["N", "NS", "NANOSECOND", "NANOSECONDLY"] - - for k in M_aliases: - alias_dict[k] = 'M' - - for k in B_aliases: - alias_dict[k] = 'B' - - for k in D_aliases: - alias_dict[k] = 'D' - - for k in H_aliases: - alias_dict[k] = 'H' - - for k in T_aliases: - alias_dict[k] = 'T' - - for k in S_aliases: - alias_dict[k] = 'S' - - for k in L_aliases: - alias_dict[k] = 'L' - - for k in U_aliases: - alias_dict[k] = 'U' - - for k in N_aliases: - alias_dict[k] = 'N' - - A_prefixes = ["A", "Y", "ANN", "ANNUAL", "ANNUALLY", "YR", "YEAR", - "YEARLY"] - - Q_prefixes = ["Q", "QTR", "QUARTER", "QUARTERLY", "Q-E", - "QTR-E", "QUARTER-E", "QUARTERLY-E"] - - month_names = [ - ["DEC", "DECEMBER"], - ["JAN", "JANUARY"], - ["FEB", "FEBRUARY"], - ["MAR", "MARCH"], - ["APR", "APRIL"], - ["MAY", "MAY"], - ["JUN", "JUNE"], - ["JUL", "JULY"], - ["AUG", "AUGUST"], - ["SEP", "SEPTEMBER"], - ["OCT", "OCTOBER"], - ["NOV", "NOVEMBER"]] - - seps = ["@", "-"] - - for k in A_prefixes: - alias_dict[k] = 'A' - for m_tup in month_names: - for sep in seps: - m1, m2 = m_tup - alias_dict[k + sep + m1] = 'A-' + m1 - alias_dict[k + sep + m2] = 'A-' + m1 - - for k in Q_prefixes: - alias_dict[k] = 'Q' - for m_tup in month_names: - for sep in seps: - m1, m2 = m_tup - alias_dict[k + sep + m1] = 'Q-' + m1 - alias_dict[k + sep + m2] = 'Q-' + m1 - - W_prefixes = ["W", "WK", "WEEK", "WEEKLY"] - - day_names = [ - ["SUN", "SUNDAY"], - ["MON", "MONDAY"], - ["TUE", "TUESDAY"], - ["WED", "WEDNESDAY"], - ["THU", "THURSDAY"], - ["FRI", "FRIDAY"], - ["SAT", "SATURDAY"]] - - for k in W_prefixes: - alias_dict[k] = 'W' - for d_tup in day_names: - for sep in ["@", "-"]: - d1, d2 = d_tup - alias_dict[k + sep + d1] = 'W-' + d1 - alias_dict[k + sep + d2] = 'W-' + d1 - - return alias_dict - - -_period_alias_dict = _period_alias_dictionary() - - def _period_str_to_code(freqstr): - # hack - if freqstr in _rule_aliases: - new = _rule_aliases[freqstr] - warnings.warn(_LEGACY_FREQ_WARNING.format(freqstr, new), - FutureWarning, stacklevel=3) - freqstr = new freqstr = _lite_rule_alias.get(freqstr, freqstr) if freqstr not in _dont_uppercase: lower = freqstr.lower() - if lower in _rule_aliases: - new = _rule_aliases[lower] - warnings.warn(_LEGACY_FREQ_WARNING.format(lower, new), - FutureWarning, stacklevel=3) - freqstr = new freqstr = _lite_rule_alias.get(lower, freqstr) + if freqstr not in _dont_uppercase: + freqstr = freqstr.upper() try: - if freqstr not in _dont_uppercase: - freqstr = freqstr.upper() return _period_code_map[freqstr] except KeyError: - try: - alias = _period_alias_dict[freqstr] - warnings.warn(_LEGACY_FREQ_WARNING.format(freqstr, alias), - FutureWarning, stacklevel=3) - except KeyError: - raise ValueError("Unknown freqstr: %s" % freqstr) - - return _period_code_map[alias] + raise ValueError(_INVALID_FREQ_ERROR.format(freqstr)) def infer_freq(index, warn=True): @@ -843,16 +693,16 @@ def infer_freq(index, warn=True): """ import pandas as pd - if isinstance(index, com.ABCSeries): + if isinstance(index, ABCSeries): values = index._values - if not (com.is_datetime64_dtype(values) or - 
com.is_timedelta64_dtype(values) or + if not (is_datetime64_dtype(values) or + is_timedelta64_dtype(values) or values.dtype == object): raise TypeError("cannot infer freq from a non-convertible " "dtype on a Series of {0}".format(index.dtype)) index = values - if com.is_period_arraylike(index): + if is_period_arraylike(index): raise TypeError("PeriodIndex given. Check the `freq` attribute " "instead of using infer_freq.") elif isinstance(index, pd.TimedeltaIndex): diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 83ab5d2a2bce4..e26a0548fdc78 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -6,13 +6,27 @@ from datetime import timedelta import numpy as np from pandas.core.base import _shared_docs -from pandas.core.common import (_INT64_DTYPE, _NS_DTYPE, _maybe_box, - _values_from_object, ABCSeries, - DatetimeTZDtype, PerformanceWarning, - is_datetimetz, is_datetime64_dtype, - is_datetime64_ns_dtype, is_dtype_equal, - is_float, is_integer, is_integer_dtype, - is_object_dtype, is_string_dtype) + +from pandas.types.common import (_NS_DTYPE, _INT64_DTYPE, + is_object_dtype, is_datetime64_dtype, + is_datetimetz, is_dtype_equal, + is_integer, is_float, + is_integer_dtype, + is_datetime64_ns_dtype, + is_period_dtype, + is_bool_dtype, + is_string_dtype, + is_list_like, + is_scalar, + pandas_dtype, + _ensure_int64) +from pandas.types.generic import ABCSeries +from pandas.types.dtypes import DatetimeTZDtype +from pandas.types.missing import isnull + +import pandas.types.concat as _concat +from pandas.core.common import (_values_from_object, _maybe_box, + PerformanceWarning) from pandas.core.index import Index, Int64Index, Float64Index from pandas.indexes.base import _index_shared_docs @@ -27,7 +41,6 @@ from pandas.util.decorators import (Appender, cache_readonly, deprecate_kwarg, Substitution) import pandas.core.common as com -import pandas.types.concat as _concat import pandas.tseries.offsets as offsets import pandas.tseries.tools as tools @@ -35,6 +48,7 @@ import pandas.lib as lib import pandas.tslib as tslib import pandas._period as period +import pandas._join as _join import pandas.algos as _algos import pandas.index as _index @@ -61,11 +75,14 @@ def f(self): self.freq.kwds.get('month', 12)) if self.freq else 12) - result = tslib.get_start_end_field( - values, field, self.freqstr, month_kw) + result = tslib.get_start_end_field(values, field, self.freqstr, + month_kw) elif field in ['weekday_name']: result = tslib.get_date_name_field(values, field) return self._maybe_mask_results(result) + elif field in ['is_leap_year']: + # no need to mask NaT + return tslib.get_date_field(values, field) else: result = tslib.get_date_field(values, field) @@ -87,7 +104,7 @@ def wrapper(self, other): isinstance(other, compat.string_types)): other = _to_m8(other, tz=self.tz) result = func(other) - if com.isnull(other): + if isnull(other): result.fill(nat_result) else: if isinstance(other, list): @@ -109,7 +126,7 @@ def wrapper(self, other): result[self._isnan] = nat_result # support of bool dtype indexers - if com.is_bool_dtype(result): + if is_bool_dtype(result): return result return Index(result) @@ -178,6 +195,9 @@ class DatetimeIndex(DatelikeOps, TimelikeOps, DatetimeIndexOpsMixin, Attempt to infer fall dst-transition hours based on order name : object Name to be stored in the index + + To learn more about the frequency strings, please see `this link + `__. 
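# --- Illustrative aside, not part of the patch ------------------------------
# The new 'is_leap_year' branch in _field_accessor above returns a plain
# boolean array and, per its comment, skips the NaT mask (NaT simply comes
# back False). Hypothetical data:
import pandas as pd

dti = pd.DatetimeIndex(['2015-01-01', '2016-01-01', pd.NaT])
print(dti.is_leap_year)   # [False,  True, False]
# -----------------------------------------------------------------------------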
""" _typ = 'datetimeindex' @@ -187,11 +207,11 @@ def _join_i8_wrapper(joinf, **kwargs): return DatetimeIndexOpsMixin._join_i8_wrapper(joinf, dtype='M8[ns]', **kwargs) - _inner_indexer = _join_i8_wrapper(_algos.inner_join_indexer_int64) - _outer_indexer = _join_i8_wrapper(_algos.outer_join_indexer_int64) - _left_indexer = _join_i8_wrapper(_algos.left_join_indexer_int64) + _inner_indexer = _join_i8_wrapper(_join.inner_join_indexer_int64) + _outer_indexer = _join_i8_wrapper(_join.outer_join_indexer_int64) + _left_indexer = _join_i8_wrapper(_join.left_join_indexer_int64) _left_indexer_unique = _join_i8_wrapper( - _algos.left_join_indexer_unique_int64, with_indexers=False) + _join.left_join_indexer_unique_int64, with_indexers=False) _arrmap = None __eq__ = _dt_index_cmp('__eq__') @@ -213,7 +233,8 @@ def _join_i8_wrapper(joinf, **kwargs): 'daysinmonth', 'date', 'time', 'microsecond', 'nanosecond', 'is_month_start', 'is_month_end', 'is_quarter_start', 'is_quarter_end', 'is_year_start', - 'is_year_end', 'tz', 'freq', 'weekday_name'] + 'is_year_end', 'tz', 'freq', 'weekday_name', + 'is_leap_year'] _is_numeric_dtype = False _infer_as_myclass = True @@ -225,6 +246,15 @@ def __new__(cls, data=None, verify_integrity=True, normalize=False, closed=None, ambiguous='raise', dtype=None, **kwargs): + # This allows to later ensure that the 'copy' parameter is honored: + if isinstance(data, Index): + ref_to_data = data._data + else: + ref_to_data = data + + if name is None and hasattr(data, 'name'): + name = data.name + dayfirst = kwargs.pop('dayfirst', None) yearfirst = kwargs.pop('yearfirst', None) @@ -268,59 +298,36 @@ def __new__(cls, data=None, ambiguous=ambiguous) if not isinstance(data, (np.ndarray, Index, ABCSeries)): - if lib.isscalar(data): + if is_scalar(data): raise ValueError('DatetimeIndex() must be called with a ' 'collection of some kind, %s was passed' % repr(data)) - # other iterable of some kind if not isinstance(data, (list, tuple)): data = list(data) - data = np.asarray(data, dtype='O') + elif isinstance(data, ABCSeries): + data = data._values - # try a few ways to make it datetime64 - if lib.is_string_array(data): - data = tslib.parse_str_array_to_datetime(data, freq=freq, - dayfirst=dayfirst, - yearfirst=yearfirst) - else: - data = tools.to_datetime(data, errors='raise') - data.offset = freq - if isinstance(data, DatetimeIndex): - if name is not None: - data.name = name - - if tz is not None: - - # we might already be localized to this tz - # so passing the same tz is ok - # however any other tz is a no-no - if data.tz is None: - return data.tz_localize(tz, ambiguous=ambiguous) - elif str(tz) != str(data.tz): - raise TypeError("Already tz-aware, use tz_convert " - "to convert.") - - return data - - if issubclass(data.dtype.type, compat.string_types): - data = tslib.parse_str_array_to_datetime(data, freq=freq, - dayfirst=dayfirst, - yearfirst=yearfirst) + # data must be Index or np.ndarray here + if not (is_datetime64_dtype(data) or is_datetimetz(data) or + is_integer_dtype(data)): + data = tools.to_datetime(data, dayfirst=dayfirst, + yearfirst=yearfirst) if issubclass(data.dtype.type, np.datetime64) or is_datetimetz(data): - if isinstance(data, ABCSeries): - data = data._values + if isinstance(data, DatetimeIndex): if tz is None: tz = data.tz - + elif data.tz is None: + data = data.tz_localize(tz, ambiguous=ambiguous) else: # the tz's must match if str(tz) != str(data.tz): - raise TypeError("Already tz-aware, use tz_convert " - "to convert.") + msg = ('data is already tz-aware {0}, 
unable to ' + 'set specified tz: {1}') + raise TypeError(msg.format(data.tz, tz)) subarr = data.values @@ -332,42 +339,13 @@ def __new__(cls, data=None, subarr = tslib.cast_to_nanoseconds(data) else: subarr = data - elif data.dtype == _INT64_DTYPE: + else: + # must be integer dtype otherwise if isinstance(data, Int64Index): raise TypeError('cannot convert Int64Index->DatetimeIndex') - if copy: - subarr = np.asarray(data, dtype=_NS_DTYPE) - else: - subarr = data.view(_NS_DTYPE) - else: - if isinstance(data, (ABCSeries, Index)): - values = data._values - else: - values = data - - if lib.is_string_array(values): - subarr = tslib.parse_str_array_to_datetime( - values, freq=freq, dayfirst=dayfirst, yearfirst=yearfirst) - else: - try: - subarr = tools.to_datetime(data, box=False) - - # make sure that we have a index/ndarray like (and not a - # Series) - if isinstance(subarr, ABCSeries): - subarr = subarr._values - if subarr.dtype == np.object_: - subarr = tools._to_datetime(subarr, box=False) - - except ValueError: - # tz aware - subarr = tools._to_datetime(data, box=False, utc=True) - - # we may not have been able to convert - if not (is_datetimetz(subarr) or - np.issubdtype(subarr.dtype, np.datetime64)): - raise ValueError('Unable to convert %s to datetime dtype' - % str(data)) + if data.dtype != _INT64_DTYPE: + data = data.astype(np.int64) + subarr = data.view(_NS_DTYPE) if isinstance(subarr, DatetimeIndex): if tz is None: @@ -382,27 +360,21 @@ def __new__(cls, data=None, ints = subarr.view('i8') subarr = tslib.tz_localize_to_utc(ints, tz, ambiguous=ambiguous) - subarr = subarr.view(_NS_DTYPE) subarr = cls._simple_new(subarr, name=name, freq=freq, tz=tz) - - # if dtype is provided, coerce here if dtype is not None: - if not is_dtype_equal(subarr.dtype, dtype): - + # dtype must be coerced to DatetimeTZDtype above if subarr.tz is not None: raise ValueError("cannot localize from non-UTC data") - dtype = DatetimeTZDtype.construct_from_string(dtype) - subarr = subarr.tz_localize(dtype.tz) if verify_integrity and len(subarr) > 0: if freq is not None and not freq_infer: inferred = subarr.inferred_freq if inferred != freq.freqstr: - on_freq = cls._generate(subarr[0], None, len( - subarr), None, freq, tz=tz, ambiguous=ambiguous) + on_freq = cls._generate(subarr[0], None, len(subarr), None, + freq, tz=tz, ambiguous=ambiguous) if not np.array_equal(subarr.asi8, on_freq.asi8): raise ValueError('Inferred frequency {0} from passed ' 'dates does not conform to passed ' @@ -414,7 +386,7 @@ def __new__(cls, data=None, if inferred: subarr.offset = to_offset(inferred) - return subarr + return subarr._deepcopy_if_needed(ref_to_data, copy) @classmethod def _generate(cls, start, end, periods, name, offset, @@ -531,21 +503,27 @@ def _generate(cls, start, end, periods, name, offset, index = _generate_regular_range(start, end, periods, offset) if tz is not None and getattr(index, 'tz', None) is None: - index = tslib.tz_localize_to_utc(com._ensure_int64(index), tz, + index = tslib.tz_localize_to_utc(_ensure_int64(index), tz, ambiguous=ambiguous) index = index.view(_NS_DTYPE) + # index is localized datetime64 array -> have to convert + # start/end as well to compare + if start is not None: + start = start.tz_localize(tz).asm8 + if end is not None: + end = end.tz_localize(tz).asm8 + if not left_closed and len(index) and index[0] == start: index = index[1:] if not right_closed and len(index) and index[-1] == end: index = index[:-1] - index = cls._simple_new(index, name=name, freq=offset, tz=tz) return index @property def 
_box_func(self): - return lambda x: Timestamp(x, offset=self.offset, tz=self.tz) + return lambda x: Timestamp(x, freq=self.offset, tz=self.tz) def _convert_for_op(self, value): """ Convert value to be insertable to ndarray """ @@ -588,7 +566,7 @@ def _simple_new(cls, values, name=None, freq=None, tz=None, return cls(values, name=name, freq=freq, tz=tz, dtype=dtype, **kwargs).values elif not is_datetime64_dtype(values): - values = com._ensure_int64(values).view(_NS_DTYPE) + values = _ensure_int64(values).view(_NS_DTYPE) result = object.__new__(cls) result._data = values @@ -645,7 +623,7 @@ def _cached_range(cls, start=None, end=None, periods=None, offset=None, xdr = generate_range(offset=offset, start=_CACHE_START, end=_CACHE_END) - arr = tools._to_datetime(list(xdr), box=False) + arr = tools.to_datetime(list(xdr), box=False) cachedRange = DatetimeIndex._simple_new(arr) cachedRange.offset = offset @@ -691,9 +669,6 @@ def _mpl_repr(self): # how to represent ourselves to matplotlib return tslib.ints_to_pydatetime(self.asi8, self.tz) - _na_value = tslib.NaT - """The expected NA value to use with this index.""" - @cache_readonly def _is_dates_only(self): from pandas.formats.format import _is_dates_only @@ -756,19 +731,43 @@ def _add_datelike(self, other): def _sub_datelike(self, other): # subtract a datetime from myself, yielding a TimedeltaIndex from pandas import TimedeltaIndex - other = Timestamp(other) - if other is tslib.NaT: - result = self._nat_new(box=False) - # require tz compat - elif not self._has_same_tz(other): - raise TypeError("Timestamp subtraction must have the same " - "timezones or no timezones") + if isinstance(other, DatetimeIndex): + # require tz compat + if not self._has_same_tz(other): + raise TypeError("DatetimeIndex subtraction must have the same " + "timezones or no timezones") + result = self._sub_datelike_dti(other) + elif isinstance(other, (tslib.Timestamp, datetime)): + other = Timestamp(other) + if other is tslib.NaT: + result = self._nat_new(box=False) + # require tz compat + elif not self._has_same_tz(other): + raise TypeError("Timestamp subtraction must have the same " + "timezones or no timezones") + else: + i8 = self.asi8 + result = i8 - other.value + result = self._maybe_mask_results(result, + fill_value=tslib.iNaT) else: - i8 = self.asi8 - result = i8 - other.value - result = self._maybe_mask_results(result, fill_value=tslib.iNaT) + raise TypeError("cannot subtract DatetimeIndex and {typ}" + .format(typ=type(other).__name__)) return TimedeltaIndex(result, name=self.name, copy=False) + def _sub_datelike_dti(self, other): + """subtraction of two DatetimeIndexes""" + if not len(self) == len(other): + raise ValueError("cannot add indices of unequal length") + + self_i8 = self.asi8 + other_i8 = other.asi8 + new_values = self_i8 - other_i8 + if self.hasnans or other.hasnans: + mask = (self._isnan) | (other._isnan) + new_values[mask] = tslib.iNaT + return new_values.view('i8') + def _maybe_update_attributes(self, attrs): """ Update Index attributes (e.g. 
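# --- Illustrative aside, not part of the patch ------------------------------
# _sub_datelike/_sub_datelike_dti above make DatetimeIndex - DatetimeIndex
# return a TimedeltaIndex, with NaT propagated through the combined mask and
# unequal lengths rejected. Hypothetical data:
import pandas as pd

a = pd.DatetimeIndex(['2016-01-02', pd.NaT, '2016-01-04'])
b = pd.DatetimeIndex(['2016-01-01', '2016-01-01', '2016-01-01'])
print(a - b)   # TimedeltaIndex(['1 days', NaT, '3 days'])
# -----------------------------------------------------------------------------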
freq) depending on op """ freq = attrs.get('freq', None) @@ -829,8 +828,7 @@ def to_datetime(self, dayfirst=False): @Appender(_index_shared_docs['astype']) def astype(self, dtype, copy=True): - dtype = np.dtype(dtype) - + dtype = pandas_dtype(dtype) if is_object_dtype(dtype): return self.asobject elif is_integer_dtype(dtype): @@ -844,6 +842,8 @@ def astype(self, dtype, copy=True): return self elif is_string_dtype(dtype): return Index(self.format(), name=self.name, dtype=object) + elif is_period_dtype(dtype): + return self.to_period(freq=dtype.freq) raise ValueError('Cannot cast DatetimeIndex to dtype %s' % dtype) def _get_time_micros(self): @@ -1032,36 +1032,6 @@ def union_many(self, others): this.offset = to_offset(this.inferred_freq) return this - def append(self, other): - """ - Append a collection of Index options together - - Parameters - ---------- - other : Index or list/tuple of indices - - Returns - ------- - appended : Index - """ - name = self.name - to_concat = [self] - - if isinstance(other, (list, tuple)): - to_concat = to_concat + list(other) - else: - to_concat.append(other) - - for obj in to_concat: - if isinstance(obj, Index) and obj.name != name: - name = None - break - - to_concat = self._ensure_compat_concat(to_concat) - to_concat, factory = _process_concat_data(to_concat, name) - - return factory(to_concat) - def join(self, other, how='left', level=None, return_indexers=False): """ See Index.join @@ -1186,8 +1156,9 @@ def __iter__(self): for i in range(chunks): start_i = i * chunksize end_i = min((i + 1) * chunksize, l) - converted = tslib.ints_to_pydatetime( - data[start_i:end_i], tz=self.tz, offset=self.offset, box=True) + converted = tslib.ints_to_pydatetime(data[start_i:end_i], + tz=self.tz, freq=self.freq, + box=True) for v in converted: yield v @@ -1549,29 +1520,21 @@ def _set_freq(self, value): doc="get/set the frequncy of the Index") year = _field_accessor('year', 'Y', "The year of the datetime") - month = _field_accessor( - 'month', 'M', "The month as January=1, December=12") + month = _field_accessor('month', 'M', + "The month as January=1, December=12") day = _field_accessor('day', 'D', "The days of the datetime") hour = _field_accessor('hour', 'h', "The hours of the datetime") minute = _field_accessor('minute', 'm', "The minutes of the datetime") second = _field_accessor('second', 's', "The seconds of the datetime") - microsecond = _field_accessor( - 'microsecond', - 'us', - "The microseconds of the datetime") - nanosecond = _field_accessor( - 'nanosecond', - 'ns', - "The nanoseconds of the datetime") - weekofyear = _field_accessor( - 'weekofyear', - 'woy', - "The week ordinal of the year") + microsecond = _field_accessor('microsecond', 'us', + "The microseconds of the datetime") + nanosecond = _field_accessor('nanosecond', 'ns', + "The nanoseconds of the datetime") + weekofyear = _field_accessor('weekofyear', 'woy', + "The week ordinal of the year") week = weekofyear - dayofweek = _field_accessor( - 'dayofweek', - 'dow', - "The day of the week with Monday=0, Sunday=6") + dayofweek = _field_accessor('dayofweek', 'dow', + "The day of the week with Monday=0, Sunday=6") weekday = dayofweek weekday_name = _field_accessor( @@ -1579,14 +1542,9 @@ def _set_freq(self, value): 'weekday_name', "The name of day in a week (ex: Friday)\n\n.. 
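# --- Illustrative aside, not part of the patch ------------------------------
# Sketch of the new is_period_dtype branch in DatetimeIndex.astype above,
# assuming the 'period[D]' string resolves to a PeriodDtype via pandas_dtype:
import pandas as pd

dti = pd.date_range('2016-01-01', periods=3, freq='D')
print(dti.astype('period[D]'))   # same result as dti.to_period('D')
# -----------------------------------------------------------------------------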
versionadded:: 0.18.1") - dayofyear = _field_accessor( - 'dayofyear', - 'doy', - "The ordinal day of the year") - quarter = _field_accessor( - 'quarter', - 'q', - "The quarter of the date") + dayofyear = _field_accessor('dayofyear', 'doy', + "The ordinal day of the year") + quarter = _field_accessor('quarter', 'q', "The quarter of the date") days_in_month = _field_accessor( 'days_in_month', 'dim', @@ -1616,6 +1574,10 @@ def _set_freq(self, value): 'is_year_end', 'is_year_end', "Logical indicating if last day of year (defined by frequency)") + is_leap_year = _field_accessor( + 'is_leap_year', + 'is_leap_year', + "Logical indicating if the date belongs to a leap year") @property def time(self): @@ -1629,7 +1591,8 @@ def time(self): @property def date(self): """ - Returns numpy array of datetime.date. The date part of the Timestamps. + Returns numpy array of python datetime.date objects (namely, the date + part of Timestamps without timezone information). """ return self._maybe_mask_results(_algos.arrmap_object( self.asobject.values, lambda x: x.date())) @@ -1669,7 +1632,7 @@ def inferred_type(self): def dtype(self): if self.tz is None: return _NS_DTYPE - return com.DatetimeTZDtype('ns', self.tz) + return DatetimeTZDtype('ns', self.tz) @property def is_all_dates(self): @@ -1686,26 +1649,6 @@ def is_normalized(self): def _resolution(self): return period.resolution(self.asi8, self.tz) - def equals(self, other): - """ - Determines if two Index objects contain the same elements. - """ - if self.is_(other): - return True - - if (not hasattr(other, 'inferred_type') or - other.inferred_type != 'datetime64'): - if self.offset is not None: - return False - try: - other = DatetimeIndex(other) - except: - return False - - if self._has_same_tz(other): - return np.array_equal(self.asi8, other.asi8) - return False - def insert(self, loc, item): """ Make new Index inserting new item at location @@ -1773,9 +1716,9 @@ def delete(self, loc): if loc in (0, -len(self), -1, len(self) - 1): freq = self.freq else: - if com.is_list_like(loc): + if is_list_like(loc): loc = lib.maybe_indices_to_slice( - com._ensure_int64(np.array(loc)), len(self)) + _ensure_int64(np.array(loc)), len(self)) if isinstance(loc, slice) and loc.step in (1, None): if (loc.start in (0, None) or loc.stop in (len(self), None)): freq = self.freq @@ -1844,7 +1787,7 @@ def tz_localize(self, tz, ambiguous='raise', errors='raise'): - 'coerce' will return NaT if the timestamp can not be converted into the specified timezone - .. versionadded:: 0.18.2 + .. versionadded:: 0.19.0 infer_dst : boolean, default False (DEPRECATED) Attempt to infer fall dst-transition hours based on order @@ -1992,7 +1935,8 @@ def _generate_regular_range(start, end, periods, offset): b = Timestamp(start).value # cannot just use e = Timestamp(end) + 1 because arange breaks when # stride is too large, see GH10887 - e = b + (Timestamp(end).value - b) // stride * stride + stride // 2 + e = (b + (Timestamp(end).value - b) // stride * stride + + stride // 2 + 1) # end.tz == start.tz by this point due to _generate implementation tz = start.tz elif start is not None: @@ -2057,6 +2001,9 @@ def date_range(start=None, end=None, periods=None, freq='D', tz=None, ----- 2 of start, end, or periods must be specified + To learn more about the frequency strings, please see `this link + `__. 
+ Returns ------- rng : DatetimeIndex @@ -2097,6 +2044,9 @@ def bdate_range(start=None, end=None, periods=None, freq='B', tz=None, ----- 2 of start, end, or periods must be specified + To learn more about the frequency strings, please see `this link + `__. + Returns ------- rng : DatetimeIndex @@ -2148,6 +2098,9 @@ def cdate_range(start=None, end=None, periods=None, freq='C', tz=None, ----- 2 of start, end, or periods must be specified + To learn more about the frequency strings, please see `this link + `__. + Returns ------- rng : DatetimeIndex @@ -2201,56 +2154,3 @@ def _use_cached_range(offset, _normalized, start, end): def _time_to_micros(time): seconds = time.hour * 60 * 60 + 60 * time.minute + time.second return 1000000 * seconds + time.microsecond - - -def _process_concat_data(to_concat, name): - klass = Index - kwargs = {} - concat = np.concatenate - - all_dti = True - need_utc_convert = False - has_naive = False - tz = None - - for x in to_concat: - if not isinstance(x, DatetimeIndex): - all_dti = False - else: - if tz is None: - tz = x.tz - - if x.tz is None: - has_naive = True - - if x.tz != tz: - need_utc_convert = True - tz = 'UTC' - - if all_dti: - need_obj_convert = False - if has_naive and tz is not None: - need_obj_convert = True - - if need_obj_convert: - to_concat = [x.asobject.values for x in to_concat] - - else: - if need_utc_convert: - to_concat = [x.tz_convert('UTC').values for x in to_concat] - else: - to_concat = [x.values for x in to_concat] - - # well, technically not a "class" anymore...oh well - klass = DatetimeIndex._simple_new - kwargs = {'tz': tz} - concat = _concat._concat_compat - else: - for i, x in enumerate(to_concat): - if isinstance(x, DatetimeIndex): - to_concat[i] = x.asobject.values - elif isinstance(x, Index): - to_concat[i] = x.values - - factory_func = lambda x: klass(concat(x), name=name, **kwargs) - return to_concat, factory_func diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index 7d3255add4f64..051cc8aa4d018 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -3,8 +3,9 @@ from pandas import compat import numpy as np +from pandas.types.generic import ABCSeries, ABCDatetimeIndex, ABCPeriod from pandas.tseries.tools import to_datetime, normalize_date -from pandas.core.common import ABCSeries, ABCDatetimeIndex, ABCPeriod +from pandas.core.common import AbstractMethodError # import after tools, dateutil check from dateutil.relativedelta import relativedelta, weekday @@ -18,6 +19,7 @@ __all__ = ['Day', 'BusinessDay', 'BDay', 'CustomBusinessDay', 'CDay', 'CBMonthEnd', 'CBMonthBegin', 'MonthBegin', 'BMonthBegin', 'MonthEnd', 'BMonthEnd', + 'SemiMonthEnd', 'SemiMonthBegin', 'BusinessHour', 'CustomBusinessHour', 'YearBegin', 'BYearBegin', 'YearEnd', 'BYearEnd', 'QuarterBegin', 'BQuarterBegin', 'QuarterEnd', 'BQuarterEnd', @@ -812,7 +814,7 @@ def apply(self, other): if bd != 0: skip_bd = BusinessDay(n=bd) - # midnight busienss hour may not on BusinessDay + # midnight business hour may not on BusinessDay if not self.next_bday.onOffset(other): remain = other - self._prev_opening_time(other) other = self._next_opening_time(other + skip_bd) + remain @@ -1160,6 +1162,214 @@ def onOffset(self, dt): _prefix = 'MS' +class SemiMonthOffset(DateOffset): + _adjust_dst = True + _default_day_of_month = 15 + _min_day_of_month = 2 + + def __init__(self, n=1, day_of_month=None, normalize=False, **kwds): + if day_of_month is None: + self.day_of_month = self._default_day_of_month + else: + self.day_of_month = int(day_of_month) + 
if not self._min_day_of_month <= self.day_of_month <= 27: + raise ValueError('day_of_month must be ' + '{}<=day_of_month<=27, got {}'.format( + self._min_day_of_month, self.day_of_month)) + self.n = int(n) + self.normalize = normalize + self.kwds = kwds + self.kwds['day_of_month'] = self.day_of_month + + @classmethod + def _from_name(cls, suffix=None): + return cls(day_of_month=suffix) + + @property + def rule_code(self): + suffix = '-{}'.format(self.day_of_month) + return self._prefix + suffix + + @apply_wraps + def apply(self, other): + n = self.n + if not self.onOffset(other): + _, days_in_month = tslib.monthrange(other.year, other.month) + if 1 < other.day < self.day_of_month: + other += relativedelta(day=self.day_of_month) + if n > 0: + # rollforward so subtract 1 + n -= 1 + elif self.day_of_month < other.day < days_in_month: + other += relativedelta(day=self.day_of_month) + if n < 0: + # rollforward in the negative direction so add 1 + n += 1 + elif n == 0: + n = 1 + + return self._apply(n, other) + + def _apply(self, n, other): + """Handle specific apply logic for child classes""" + raise AbstractMethodError(self) + + @apply_index_wraps + def apply_index(self, i): + # determine how many days away from the 1st of the month we are + days_from_start = i.to_perioddelta('M').asi8 + delta = Timedelta(days=self.day_of_month - 1).value + + # get boolean array for each element before the day_of_month + before_day_of_month = days_from_start < delta + + # get boolean array for each element after the day_of_month + after_day_of_month = days_from_start > delta + + # determine the correct n for each date in i + roll = self._get_roll(i, before_day_of_month, after_day_of_month) + + # isolate the time since it will be striped away one the next line + time = i.to_perioddelta('D') + + # apply the correct number of months + i = (i.to_period('M') + (roll // 2)).to_timestamp() + + # apply the correct day + i = self._apply_index_days(i, roll) + + return i + time + + def _get_roll(self, i, before_day_of_month, after_day_of_month): + """Return an array with the correct n for each date in i. + + The roll array is based on the fact that i gets rolled back to + the first day of the month. + """ + raise AbstractMethodError(self) + + def _apply_index_days(self, i, roll): + """Apply the correct day for each date in i""" + raise AbstractMethodError(self) + + +class SemiMonthEnd(SemiMonthOffset): + """ + Two DateOffset's per month repeating on the last + day of the month and day_of_month. + + .. 
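# --- Illustrative aside, not part of the patch ------------------------------
# Sketch of the vectorised SemiMonthOffset.apply_index above for the default
# SemiMonthEnd (day_of_month=15, n=1): roll // 2 picks the target month and
# roll % 2 decides between day_of_month and month end. Expected values are
# worked out from that logic, not taken from a test run.
import pandas as pd
from pandas.tseries.offsets import SemiMonthEnd

dti = pd.DatetimeIndex(['2016-01-05', '2016-01-20', '2016-01-31'])
print(dti + SemiMonthEnd())   # ['2016-01-15', '2016-01-31', '2016-02-15']
# -----------------------------------------------------------------------------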
versionadded:: 0.19.0 + + Parameters + ---------- + n: int + normalize : bool, default False + day_of_month: int, {1, 3,...,27}, default 15 + """ + _prefix = 'SM' + _min_day_of_month = 1 + + def onOffset(self, dt): + if self.normalize and not _is_normalized(dt): + return False + _, days_in_month = tslib.monthrange(dt.year, dt.month) + return dt.day in (self.day_of_month, days_in_month) + + def _apply(self, n, other): + # if other.day is not day_of_month move to day_of_month and update n + if other.day < self.day_of_month: + other += relativedelta(day=self.day_of_month) + if n > 0: + n -= 1 + elif other.day > self.day_of_month: + other += relativedelta(day=self.day_of_month) + if n == 0: + n = 1 + else: + n += 1 + + months = n // 2 + day = 31 if n % 2 else self.day_of_month + return other + relativedelta(months=months, day=day) + + def _get_roll(self, i, before_day_of_month, after_day_of_month): + n = self.n + is_month_end = i.is_month_end + if n > 0: + roll_end = np.where(is_month_end, 1, 0) + roll_before = np.where(before_day_of_month, n, n + 1) + roll = roll_end + roll_before + elif n == 0: + roll_after = np.where(after_day_of_month, 2, 0) + roll_before = np.where(~after_day_of_month, 1, 0) + roll = roll_before + roll_after + else: + roll = np.where(after_day_of_month, n + 2, n + 1) + return roll + + def _apply_index_days(self, i, roll): + i += (roll % 2) * Timedelta(days=self.day_of_month).value + return i + Timedelta(days=-1) + + +class SemiMonthBegin(SemiMonthOffset): + """ + Two DateOffset's per month repeating on the first + day of the month and day_of_month. + + .. versionadded:: 0.19.0 + + Parameters + ---------- + n: int + normalize : bool, default False + day_of_month: int, {2, 3,...,27}, default 15 + """ + _prefix = 'SMS' + + def onOffset(self, dt): + if self.normalize and not _is_normalized(dt): + return False + return dt.day in (1, self.day_of_month) + + def _apply(self, n, other): + # if other.day is not day_of_month move to day_of_month and update n + if other.day < self.day_of_month: + other += relativedelta(day=self.day_of_month) + if n == 0: + n = -1 + else: + n -= 1 + elif other.day > self.day_of_month: + other += relativedelta(day=self.day_of_month) + if n == 0: + n = 1 + elif n < 0: + n += 1 + + months = n // 2 + n % 2 + day = 1 if n % 2 else self.day_of_month + return other + relativedelta(months=months, day=day) + + def _get_roll(self, i, before_day_of_month, after_day_of_month): + n = self.n + is_month_start = i.is_month_start + if n > 0: + roll = np.where(before_day_of_month, n, n + 1) + elif n == 0: + roll_start = np.where(is_month_start, 0, 1) + roll_after = np.where(after_day_of_month, 1, 0) + roll = roll_start + roll_after + else: + roll_after = np.where(after_day_of_month, n + 2, n + 1) + roll_start = np.where(is_month_start, -1, 0) + roll = roll_after + roll_start + return roll + + def _apply_index_days(self, i, roll): + return i + (roll % 2) * Timedelta(days=self.day_of_month - 1).value + + class BusinessMonthEnd(MonthOffset): """DateOffset increments between business EOM dates""" @@ -2720,6 +2930,8 @@ def generate_range(start=None, end=None, periods=None, CustomBusinessHour, # 'CBH' MonthEnd, # 'M' MonthBegin, # 'MS' + SemiMonthEnd, # 'SM' + SemiMonthBegin, # 'SMS' Week, # 'W' Second, # 'S' Minute, # 'T' diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index c3deee5f6dab2..d5d89c8dc2614 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -1,6 +1,27 @@ # pylint: disable=E1101,E1103,W0232 from datetime import datetime, 
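# --- Illustrative aside, not part of the patch ------------------------------
# The two new offsets register the 'SM' / 'SMS' prefixes in the
# generate_range prefix list above, so they can be used directly as frequency
# strings. Hypothetical ranges:
import pandas as pd

print(pd.date_range('2016-01-01', '2016-02-29', freq='SM'))
# DatetimeIndex(['2016-01-15', '2016-01-31', '2016-02-15', '2016-02-29'], ...)
print(pd.date_range('2016-01-01', '2016-02-15', freq='SMS'))
# DatetimeIndex(['2016-01-01', '2016-01-15', '2016-02-01', '2016-02-15'], ...)
# -----------------------------------------------------------------------------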
timedelta import numpy as np +import warnings + + +from pandas.core import common as com +from pandas.types.common import (is_integer, + is_float, + is_object_dtype, + is_integer_dtype, + is_float_dtype, + is_scalar, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_timedelta64_dtype, + is_period_dtype, + is_bool_dtype, + pandas_dtype, + _ensure_int64, + _ensure_object) +from pandas.types.dtypes import PeriodDtype +from pandas.types.generic import ABCSeries + import pandas.tseries.frequencies as frequencies from pandas.tseries.frequencies import get_freq_code as _gfc from pandas.tseries.index import DatetimeIndex, Int64Index, Index @@ -15,43 +36,29 @@ _quarter_to_myear) from pandas.core.base import _shared_docs -from pandas.indexes.base import _index_shared_docs +from pandas.indexes.base import _index_shared_docs, _ensure_index -import pandas.core.common as com -from pandas.core.common import ( - _maybe_box, _values_from_object, ABCSeries, is_float, is_integer, - is_integer_dtype, is_object_dtype, isnull) from pandas import compat -from pandas.compat.numpy import function as nv from pandas.util.decorators import Appender, cache_readonly, Substitution -from pandas.lib import Timedelta -import pandas.lib as lib +from pandas.lib import infer_dtype import pandas.tslib as tslib -import pandas.core.missing as missing from pandas.compat import zip, u def _field_accessor(name, alias, docstring=None): def f(self): base, mult = _gfc(self.freq) - return get_period_field_arr(alias, self.values, base) + return get_period_field_arr(alias, self._values, base) f.__name__ = name f.__doc__ = docstring return property(f) -def _get_ordinals(data, freq): - f = lambda x: Period(x, freq=freq).ordinal - if isinstance(data[0], Period): - return period.extract_ordinals(data, freq) - else: - return lib.map_infer(data, f) - - def dt64arr_to_periodarr(data, freq, tz): if data.dtype != np.dtype('M8[ns]'): raise ValueError('Wrong dtype: %s' % data.dtype) + freq = Period._maybe_convert_freq(freq) base, mult = _gfc(freq) return period.dt64arr_to_periodarr(data.view('i8'), base, tz) @@ -67,7 +74,7 @@ def _period_index_cmp(opname, nat_result=False): def wrapper(self, other): if isinstance(other, Period): - func = getattr(self.values, opname) + func = getattr(self._values, opname) other_base, _ = _gfc(other.freq) if other.freq != self.freq: msg = _DIFFERENT_FREQ_INDEX.format(self.freqstr, other.freqstr) @@ -79,24 +86,23 @@ def wrapper(self, other): msg = _DIFFERENT_FREQ_INDEX.format(self.freqstr, other.freqstr) raise IncompatibleFrequency(msg) - result = getattr(self.values, opname)(other.values) + result = getattr(self._values, opname)(other._values) - mask = (missing.mask_missing(self.values, tslib.iNaT) | - missing.mask_missing(other.values, tslib.iNaT)) + mask = self._isnan | other._isnan if mask.any(): result[mask] = nat_result return result + elif other is tslib.NaT: + result = np.empty(len(self._values), dtype=bool) + result.fill(nat_result) else: other = Period(other, freq=self.freq) - func = getattr(self.values, opname) + func = getattr(self._values, opname) result = func(other.ordinal) - if other.ordinal == tslib.iNaT: - result.fill(nat_result) - mask = self.values == tslib.iNaT - if mask.any(): - result[mask] = nat_result + if self.hasnans: + result[self._isnan] = nat_result return result return wrapper @@ -122,7 +128,6 @@ class PeriodIndex(DatelikeOps, DatetimeIndexOpsMixin, Int64Index): ---------- data : array-like (1-dimensional), optional Optional period-like data to construct index with - dtype : NumPy 
dtype (default: i8) copy : bool Make a copy of input ndarray freq : string or period object, optional @@ -145,6 +150,7 @@ class PeriodIndex(DatelikeOps, DatetimeIndexOpsMixin, Int64Index): second : int, array, or Series, default None tz : object, default None Timezone for converting datetime64 data to Periods + dtype : str or PeriodDtype, default None Examples -------- @@ -159,7 +165,8 @@ class PeriodIndex(DatelikeOps, DatetimeIndexOpsMixin, Int64Index): 'weekofyear', 'week', 'dayofweek', 'weekday', 'dayofyear', 'quarter', 'qyear', 'freq', 'days_in_month', 'daysinmonth', - 'to_timestamp', 'asfreq', 'start_time', 'end_time'] + 'to_timestamp', 'asfreq', 'start_time', 'end_time', + 'is_leap_year'] _is_numeric_dtype = False _infer_as_myclass = True @@ -173,7 +180,8 @@ class PeriodIndex(DatelikeOps, DatetimeIndexOpsMixin, Int64Index): __ge__ = _period_index_cmp('__ge__') def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None, - periods=None, copy=False, name=None, tz=None, **kwargs): + periods=None, copy=False, name=None, tz=None, dtype=None, + **kwargs): if periods is not None: if is_float(periods): @@ -182,6 +190,19 @@ def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None, raise ValueError('Periods must be a number, got %s' % str(periods)) + if name is None and hasattr(data, 'name'): + name = data.name + + if dtype is not None: + dtype = pandas_dtype(dtype) + if not is_period_dtype(dtype): + raise ValueError('dtype must be PeriodDtype') + if freq is None: + freq = dtype.freq + elif freq != dtype.freq: + msg = 'specified freq and dtype are different' + raise IncompatibleFrequency(msg) + if data is None: if ordinal is not None: data = np.asarray(ordinal, dtype=np.int64) @@ -190,12 +211,15 @@ def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None, freq, kwargs) else: ordinal, freq = cls._from_arraylike(data, freq, tz) - data = np.array(ordinal, dtype=np.int64, copy=False) + data = np.array(ordinal, dtype=np.int64, copy=copy) return cls._simple_new(data, name=name, freq=freq) @classmethod def _generate_range(cls, start, end, periods, freq, fields): + if freq is not None: + freq = Period._maybe_convert_freq(freq) + field_count = len(fields) if com._count_not_none(start, end) > 0: if field_count > 0: @@ -212,9 +236,12 @@ def _generate_range(cls, start, end, periods, freq, fields): @classmethod def _from_arraylike(cls, data, freq, tz): + if freq is not None: + freq = Period._maybe_convert_freq(freq) + if not isinstance(data, (np.ndarray, PeriodIndex, DatetimeIndex, Int64Index)): - if lib.isscalar(data) or isinstance(data, Period): + if is_scalar(data) or isinstance(data, Period): raise ValueError('PeriodIndex() must be called with a ' 'collection of some kind, %s was passed' % repr(data)) @@ -224,58 +251,57 @@ def _from_arraylike(cls, data, freq, tz): data = list(data) try: - data = com._ensure_int64(data) + data = _ensure_int64(data) if freq is None: raise ValueError('freq not specified') - data = np.array([Period(x, freq=freq).ordinal for x in data], + data = np.array([Period(x, freq=freq) for x in data], dtype=np.int64) except (TypeError, ValueError): - data = com._ensure_object(data) - - if freq is None and len(data) > 0: - freq = getattr(data[0], 'freq', None) + data = _ensure_object(data) if freq is None: - raise ValueError('freq not specified and cannot be ' - 'inferred from first element') - - data = _get_ordinals(data, freq) + freq = period.extract_freq(data) + data = period.extract_ordinals(data, freq) else: if isinstance(data, 
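# --- Illustrative aside, not part of the patch ------------------------------
# The PeriodIndex constructor above now accepts a dtype and checks it against
# an explicit freq; a small sketch assuming 'period[M]' parses to a PeriodDtype:
import pandas as pd

pidx = pd.PeriodIndex(['2016-01', '2016-02'], dtype='period[M]')
print(pidx.freq)     # monthly, taken from the dtype

try:
    pd.PeriodIndex(['2016-01'], freq='D', dtype='period[M]')
except ValueError as exc:
    print(exc)       # specified freq and dtype are different
# -----------------------------------------------------------------------------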
PeriodIndex): if freq is None or freq == data.freq: freq = data.freq - data = data.values + data = data._values else: base1, _ = _gfc(data.freq) base2, _ = _gfc(freq) - data = period.period_asfreq_arr(data.values, + data = period.period_asfreq_arr(data._values, base1, base2, 1) else: - if freq is None and len(data) > 0: - freq = getattr(data[0], 'freq', None) + if is_object_dtype(data): + inferred = infer_dtype(data) + if inferred == 'integer': + data = data.astype(np.int64) + + if freq is None and is_object_dtype(data): + # must contain Period instance and thus extract ordinals + freq = period.extract_freq(data) + data = period.extract_ordinals(data, freq) if freq is None: - raise ValueError('freq not specified and cannot be ' - 'inferred from first element') + msg = 'freq not specified and cannot be inferred' + raise ValueError(msg) if data.dtype != np.int64: if np.issubdtype(data.dtype, np.datetime64): data = dt64arr_to_periodarr(data, freq, tz) else: - try: - data = com._ensure_int64(data) - except (TypeError, ValueError): - data = com._ensure_object(data) - data = _get_ordinals(data, freq) + data = _ensure_object(data) + data = period.extract_ordinals(data, freq) return data, freq @classmethod def _simple_new(cls, values, name=None, freq=None, **kwargs): - if not com.is_integer_dtype(values): + if not is_integer_dtype(values): values = np.array(values, copy=False) - if (len(values) > 0 and com.is_float_dtype(values)): + if (len(values) > 0 and is_float_dtype(values)): raise TypeError("PeriodIndex can't take floats") else: return PeriodIndex(values, name=name, freq=freq, **kwargs) @@ -311,20 +337,42 @@ def _coerce_scalar_to_index(self, item): """ return PeriodIndex([item], **self._get_attributes_dict()) - @property - def _na_value(self): - return self._box_func(tslib.iNaT) - def __contains__(self, key): - if not isinstance(key, Period) or key.freq != self.freq: - if isinstance(key, compat.string_types): - try: - self.get_loc(key) - return True - except Exception: - return False + if isinstance(key, Period): + if key.freq != self.freq: + return False + else: + return key.ordinal in self._engine + else: + try: + self.get_loc(key) + return True + except Exception: + return False return False - return key.ordinal in self._engine + + @property + def asi8(self): + return self._values.view('i8') + + @property + def _int64index(self): + # do not cache, same as .asi8 + return Int64Index(self.asi8, name=self.name, fastpath=True) + + @property + def values(self): + return self.asobject.values + + @property + def _values(self): + return self._data + + def __array__(self, dtype=None): + if is_integer_dtype(dtype): + return self.asi8 + else: + return self.asobject.values def __array_wrap__(self, result, context=None): """ @@ -336,9 +384,17 @@ def __array_wrap__(self, result, context=None): if isinstance(context, tuple) and len(context) > 0: func = context[0] if (func is np.add): - return self._add_delta(context[1][1]) + pass elif (func is np.subtract): - return self._add_delta(-context[1][1]) + name = self.name + left = context[1][0] + right = context[1][1] + if (isinstance(left, PeriodIndex) and + isinstance(right, PeriodIndex)): + name = left.name if left.name == right.name else None + return Index(result, name=name) + elif isinstance(left, Period) or isinstance(right, Period): + return Index(result, name=name) elif isinstance(func, np.ufunc): if 'M->M' not in func.types: msg = "ufunc '{0}' not supported for the PeriodIndex" @@ -346,8 +402,10 @@ def __array_wrap__(self, result, context=None): # 
from here because numpy catches. raise ValueError(msg.format(func.__name__)) - if com.is_bool_dtype(result): + if is_bool_dtype(result): return result + # the result is object dtype array of Period + # cannot pass _simple_new as it is return PeriodIndex(result, freq=self.freq, name=self.name) @property @@ -374,27 +432,32 @@ def asof_locs(self, where, mask): if isinstance(where_idx, DatetimeIndex): where_idx = PeriodIndex(where_idx.values, freq=self.freq) - locs = self.values[mask].searchsorted(where_idx.values, side='right') + locs = self._values[mask].searchsorted(where_idx._values, side='right') locs = np.where(locs > 0, locs - 1, 0) result = np.arange(len(self))[mask].take(locs) first = mask.argmax() - result[(locs == 0) & (where_idx.values < self.values[first])] = -1 + result[(locs == 0) & (where_idx._values < self._values[first])] = -1 return result - def _array_values(self): - return self.asobject - @Appender(_index_shared_docs['astype']) - def astype(self, dtype, copy=True): - dtype = np.dtype(dtype) + def astype(self, dtype, copy=True, how='start'): + dtype = pandas_dtype(dtype) if is_object_dtype(dtype): return self.asobject elif is_integer_dtype(dtype): - return Index(self.values.astype('i8', copy=copy), name=self.name, - dtype='i8') + if copy: + return self._int64index.copy() + else: + return self._int64index + elif is_datetime64_dtype(dtype): + return self.to_timestamp(how=how) + elif is_datetime64tz_dtype(dtype): + return self.to_timestamp(how=how).tz_localize(dtype.tz) + elif is_period_dtype(dtype): + return self.asfreq(freq=dtype.freq) raise ValueError('Cannot cast PeriodIndex to dtype %s' % dtype) @Substitution(klass='PeriodIndex', value='key') @@ -408,7 +471,7 @@ def searchsorted(self, key, side='left', sorter=None): elif isinstance(key, compat.string_types): key = Period(key, freq=self.freq).ordinal - return self.values.searchsorted(key, side=side, sorter=sorter) + return self._values.searchsorted(key, side=side, sorter=sorter) @property def is_all_dates(self): @@ -467,7 +530,7 @@ def asfreq(self, freq=None, how='E'): """ how = _validate_end_alias(how) - freq = frequencies.get_standard_freq(freq) + freq = Period._maybe_convert_freq(freq) base1, mult1 = _gfc(self.freq) base2, mult2 = _gfc(freq) @@ -483,12 +546,18 @@ def asfreq(self, freq=None, how='E'): new_data = period.period_asfreq_arr(ordinal, base1, base2, end) if self.hasnans: - mask = asi8 == tslib.iNaT - new_data[mask] = tslib.iNaT + new_data[self._isnan] = tslib.iNaT return self._simple_new(new_data, self.name, freq=freq) def to_datetime(self, dayfirst=False): + """ + DEPRECATED: use :meth:`to_timestamp` instead. + + Cast to DatetimeIndex. + """ + warnings.warn("to_datetime is deprecated. 
Use self.to_timestamp(...)", + FutureWarning, stacklevel=2) return self.to_timestamp() year = _field_accessor('year', 0, "The year of the period") @@ -499,17 +568,22 @@ def to_datetime(self, dayfirst=False): second = _field_accessor('second', 7, "The second of the period") weekofyear = _field_accessor('week', 8, "The week ordinal of the year") week = weekofyear - dayofweek = _field_accessor( - 'dayofweek', 10, "The day of the week with Monday=0, Sunday=6") + dayofweek = _field_accessor('dayofweek', 10, + "The day of the week with Monday=0, Sunday=6") weekday = dayofweek - dayofyear = day_of_year = _field_accessor( - 'dayofyear', 9, "The ordinal day of the year") + dayofyear = day_of_year = _field_accessor('dayofyear', 9, + "The ordinal day of the year") quarter = _field_accessor('quarter', 2, "The quarter of the date") qyear = _field_accessor('qyear', 1) - days_in_month = _field_accessor( - 'days_in_month', 11, "The number of days in the month") + days_in_month = _field_accessor('days_in_month', 11, + "The number of days in the month") daysinmonth = days_in_month + @property + def is_leap_year(self): + """ Logical indicating if the date belongs to a leap year """ + return tslib._isleapyear_arr(self.year) + @property def start_time(self): return self.to_timestamp(how='start') @@ -518,30 +592,9 @@ def start_time(self): def end_time(self): return self.to_timestamp(how='end') - def _get_object_array(self): - freq = self.freq - return np.array([Period._from_ordinal(ordinal=x, freq=freq) - for x in self.values], copy=False) - def _mpl_repr(self): # how to represent ourselves to matplotlib - return self._get_object_array() - - def equals(self, other): - """ - Determines if two Index objects contain the same elements. - """ - if self.is_(other): - return True - - if (not hasattr(other, 'inferred_type') or - other.inferred_type != 'int64'): - try: - other = PeriodIndex(other) - except: - return False - - return np.array_equal(self.asi8, other.asi8) + return self.asobject.values def to_timestamp(self, freq=None, how='start'): """ @@ -563,16 +616,17 @@ def to_timestamp(self, freq=None, how='start'): if freq is None: base, mult = _gfc(self.freq) freq = frequencies.get_to_timestamp_base(base) + else: + freq = Period._maybe_convert_freq(freq) base, mult = _gfc(freq) new_data = self.asfreq(freq, how) - new_data = period.periodarr_to_dt64arr(new_data.values, base) + new_data = period.periodarr_to_dt64arr(new_data._values, base) return DatetimeIndex(new_data, freq='infer', name=self.name) def _maybe_convert_timedelta(self, other): - if isinstance(other, (timedelta, np.timedelta64, - offsets.Tick, Timedelta)): + if isinstance(other, (timedelta, np.timedelta64, offsets.Tick)): offset = frequencies.to_offset(self.freq.rule_code) if isinstance(offset, offsets.Tick): nanos = tslib._delta_to_nanoseconds(other) @@ -580,22 +634,27 @@ def _maybe_convert_timedelta(self, other): if nanos % offset_nanos == 0: return nanos // offset_nanos elif isinstance(other, offsets.DateOffset): - freqstr = frequencies.get_standard_freq(other) + freqstr = other.rule_code base = frequencies.get_base_alias(freqstr) if base == self.freq.rule_code: return other.n msg = _DIFFERENT_FREQ_INDEX.format(self.freqstr, other.freqstr) raise IncompatibleFrequency(msg) elif isinstance(other, np.ndarray): - if com.is_integer_dtype(other): + if is_integer_dtype(other): return other - elif com.is_timedelta64_dtype(other): + elif is_timedelta64_dtype(other): offset = frequencies.to_offset(self.freq) if isinstance(offset, offsets.Tick): nanos = 
tslib._delta_to_nanoseconds(other) offset_nanos = tslib._delta_to_nanoseconds(offset) if (nanos % offset_nanos).all() == 0: return nanos // offset_nanos + elif is_integer(other): + # integer is passed to .shift via + # _add_datetimelike_methods basically + # but ufunc may pass integer to _add_delta + return other # raise when input doesn't have freq msg = "Input has different freq from PeriodIndex(freq={0})" raise IncompatibleFrequency(msg.format(self.freqstr)) @@ -616,17 +675,12 @@ def _sub_period(self, other): msg = _DIFFERENT_FREQ_INDEX.format(self.freqstr, other.freqstr) raise IncompatibleFrequency(msg) - if other.ordinal == tslib.iNaT: - new_data = np.empty(len(self)) - new_data.fill(np.nan) - else: - asi8 = self.asi8 - new_data = asi8 - other.ordinal + asi8 = self.asi8 + new_data = asi8 - other.ordinal - if self.hasnans: - mask = asi8 == tslib.iNaT - new_data = new_data.astype(np.float64) - new_data[mask] = np.nan + if self.hasnans: + new_data = new_data.astype(np.float64) + new_data[self._isnan] = np.nan # result must be Int64Index or Float64Index return Index(new_data, name=self.name) @@ -643,15 +697,14 @@ def shift(self, n): ------- shifted : PeriodIndex """ - values = self.values + n * self.freq.n + values = self._values + n * self.freq.n if self.hasnans: values[self._isnan] = tslib.iNaT return PeriodIndex(data=values, name=self.name, freq=self.freq) @cache_readonly - def dtype_str(self): - """ return the dtype str of the underlying data """ - return self.inferred_type + def dtype(self): + return PeriodDtype.construct_from_string(self.freq) @property def inferred_type(self): @@ -664,17 +717,18 @@ def get_value(self, series, key): Fast lookup of value from 1-dimensional ndarray. Only use this if you know what you're doing """ - s = _values_from_object(series) + s = com._values_from_object(series) try: - return _maybe_box(self, super(PeriodIndex, self).get_value(s, key), - series, key) + return com._maybe_box(self, + super(PeriodIndex, self).get_value(s, key), + series, key) except (KeyError, IndexError): try: asdt, parsed, reso = parse_time_string(key, self.freq) grp = frequencies.Resolution.get_freq_group(reso) freqn = frequencies.get_freq_group(self.freq) - vals = self.values + vals = self._values # if our data is higher resolution than requested key, slice if grp < freqn: @@ -685,27 +739,45 @@ def get_value(self, series, key): if ord2 < vals[0] or ord1 > vals[-1]: raise KeyError(key) - pos = np.searchsorted(self.values, [ord1, ord2]) + pos = np.searchsorted(self._values, [ord1, ord2]) key = slice(pos[0], pos[1] + 1) return series[key] elif grp == freqn: key = Period(asdt, freq=self.freq).ordinal - return _maybe_box(self, self._engine.get_value(s, key), - series, key) + return com._maybe_box(self, self._engine.get_value(s, key), + series, key) else: raise KeyError(key) except TypeError: pass key = Period(key, self.freq).ordinal - return _maybe_box(self, self._engine.get_value(s, key), - series, key) + return com._maybe_box(self, self._engine.get_value(s, key), + series, key) def get_indexer(self, target, method=None, limit=None, tolerance=None): + target = _ensure_index(target) + if hasattr(target, 'freq') and target.freq != self.freq: msg = _DIFFERENT_FREQ_INDEX.format(self.freqstr, target.freqstr) raise IncompatibleFrequency(msg) - return Index.get_indexer(self, target, method, limit, tolerance) + + if isinstance(target, PeriodIndex): + target = target.asi8 + + if tolerance is not None: + tolerance = self._convert_tolerance(tolerance) + return 
Index.get_indexer(self._int64index, target, method, + limit, tolerance) + + def _get_unique_index(self, dropna=False): + """ + wrap Index._get_unique_index to handle NaT + """ + res = super(PeriodIndex, self)._get_unique_index(dropna=dropna) + if dropna: + res = res.dropna() + return res def get_loc(self, key, method=None, tolerance=None): """ @@ -733,8 +805,13 @@ def get_loc(self, key, method=None, tolerance=None): # we cannot construct the Period # as we have an invalid type raise KeyError(key) + try: - return Index.get_loc(self, key.ordinal, method, tolerance) + ordinal = tslib.iNaT if key is tslib.NaT else key.ordinal + if tolerance is not None: + tolerance = self._convert_tolerance(tolerance) + return self._int64index.get_loc(ordinal, method, tolerance) + except KeyError: raise KeyError(key) @@ -819,6 +896,14 @@ def _convert_tolerance(self, tolerance): tolerance = DatetimeIndexOpsMixin._convert_tolerance(self, tolerance) return self._maybe_convert_timedelta(tolerance) + def insert(self, loc, item): + if not isinstance(item, Period) or self.freq != item.freq: + return self.asobject.insert(loc, item) + + idx = np.concatenate((self[:loc].asi8, np.array([item.ordinal]), + self[loc:].asi8)) + return self._shallow_copy(idx) + def join(self, other, how='left', level=None, return_indexers=False): """ See Index.join @@ -854,91 +939,25 @@ def _apply_meta(self, rawarr): rawarr = PeriodIndex(rawarr, freq=self.freq) return rawarr - def __getitem__(self, key): - getitem = self._data.__getitem__ - if lib.isscalar(key): - val = getitem(key) - return Period(ordinal=val, freq=self.freq) - else: - if com.is_bool_indexer(key): - key = np.asarray(key) - - result = getitem(key) - if result.ndim > 1: - # MPL kludge - # values = np.asarray(list(values), dtype=object) - # return values.reshape(result.shape) - - return PeriodIndex(result, name=self.name, freq=self.freq) - - return PeriodIndex(result, name=self.name, freq=self.freq) - def _format_native_types(self, na_rep=u('NaT'), date_format=None, **kwargs): - values = np.array(list(self), dtype=object) - mask = isnull(self.values) - values[mask] = na_rep - imask = ~mask + values = self.asobject.values if date_format: formatter = lambda dt: dt.strftime(date_format) else: formatter = lambda dt: u('%s') % dt - values[imask] = np.array([formatter(dt) for dt in values[imask]]) - return values - - def append(self, other): - """ - Append a collection of Index options together - - Parameters - ---------- - other : Index or list/tuple of indices - Returns - ------- - appended : Index - """ - name = self.name - to_concat = [self] - - if isinstance(other, (list, tuple)): - to_concat = to_concat + list(other) + if self.hasnans: + mask = self._isnan + values[mask] = na_rep + imask = ~mask + values[imask] = np.array([formatter(dt) for dt + in values[imask]]) else: - to_concat.append(other) - - for obj in to_concat: - if isinstance(obj, Index) and obj.name != name: - name = None - break - - to_concat = self._ensure_compat_concat(to_concat) - - if isinstance(to_concat[0], PeriodIndex): - if len(set([x.freq for x in to_concat])) > 1: - # box - to_concat = [x.asobject.values for x in to_concat] - else: - cat_values = np.concatenate([x.values for x in to_concat]) - return PeriodIndex(cat_values, freq=self.freq, name=name) - - to_concat = [x.values if isinstance(x, Index) else x - for x in to_concat] - return Index(com._concat_compat(to_concat), name=name) - - def repeat(self, n, *args, **kwargs): - """ - Return a new Index of the values repeated `n` times. 
- - See also - -------- - numpy.ndarray.repeat - """ - nv.validate_repeat(args, kwargs) - - # overwrites method from DatetimeIndexOpsMixin - return self._shallow_copy(self.values.repeat(n)) + values = np.array([formatter(dt) for dt in values]) + return values def __setstate__(self, state): """Necessary for making this object picklable""" @@ -1037,8 +1056,7 @@ def _get_ordinal_range(start, end, periods, freq, mult=1): if is_start_per and is_end_per and start.freq != end.freq: raise ValueError('Start and end must have same freq') - if ((is_start_per and start.ordinal == tslib.iNaT) or - (is_end_per and end.ordinal == tslib.iNaT)): + if (start is tslib.NaT or end is tslib.NaT): raise ValueError('Start and end must not be NaT') if freq is None: diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py index 8d6955ab43711..f1a209053445a 100644 --- a/pandas/tseries/resample.py +++ b/pandas/tseries/resample.py @@ -60,12 +60,15 @@ class Resampler(_GroupBy): 'loffset', 'base', 'kind'] # API compat of allowed attributes - _deprecated_valids = _attributes + ['_ipython_display_', '__doc__', - '_cache', '_attributes', 'binner', - 'grouper', 'groupby', 'keys', - 'sort', 'kind', 'squeeze', - 'group_keys', 'as_index', - 'exclusions', '_groupby'] + _deprecated_valids = _attributes + ['__doc__', '_cache', '_attributes', + 'binner', 'grouper', 'groupby', + 'sort', 'kind', 'squeeze', 'keys', + 'group_keys', 'as_index', 'exclusions', + '_groupby'] + + # don't raise deprecation warning on attributes starting with these + # patterns - prevents warnings caused by IPython introspection + _deprecated_valid_patterns = ['_ipython', '_repr'] # API compat of disallowed attributes _deprecated_invalids = ['iloc', 'loc', 'ix', 'iat', 'at'] @@ -109,9 +112,21 @@ def _typ(self): return 'series' return 'dataframe' - def _deprecated(self): - warnings.warn(".resample() is now a deferred operation\n" - "use .resample(...).mean() instead of .resample(...)", + @property + def _from_selection(self): + """ is the resampling from a DataFrame column or MultiIndex level """ + # upsampling and PeriodIndex resampling do not work + # with selection, this state used to catch and raise an error + return (self.groupby is not None and + (self.groupby.key is not None or + self.groupby.level is not None)) + + def _deprecated(self, op): + warnings.warn(("\n.resample() is now a deferred operation\n" + "You called {op}(...) on this deferred object " + "which materialized it into a {klass}\nby implicitly " + "taking the mean. 
Use .resample(...).mean() " + "instead").format(op=op, klass=self._typ), FutureWarning, stacklevel=3) return self.mean() @@ -119,20 +134,20 @@ def _make_deprecated_binop(op): # op is a string def _evaluate_numeric_binop(self, other): - result = self._deprecated() + result = self._deprecated(op) return getattr(result, op)(other) return _evaluate_numeric_binop - def _make_deprecated_unary(op): + def _make_deprecated_unary(op, name): # op is a callable def _evaluate_numeric_unary(self): - result = self._deprecated() + result = self._deprecated(name) return op(result) return _evaluate_numeric_unary def __array__(self): - return self._deprecated().__array__() + return self._deprecated('__array__').__array__() __gt__ = _make_deprecated_binop('__gt__') __ge__ = _make_deprecated_binop('__ge__') @@ -148,10 +163,10 @@ def __array__(self): __truediv__ = __rtruediv__ = _make_deprecated_binop('__truediv__') if not compat.PY3: __div__ = __rdiv__ = _make_deprecated_binop('__div__') - __neg__ = _make_deprecated_unary(lambda x: -x) - __pos__ = _make_deprecated_unary(lambda x: x) - __abs__ = _make_deprecated_unary(lambda x: np.abs(x)) - __inv__ = _make_deprecated_unary(lambda x: -x) + __neg__ = _make_deprecated_unary(lambda x: -x, '__neg__') + __pos__ = _make_deprecated_unary(lambda x: x, '__pos__') + __abs__ = _make_deprecated_unary(lambda x: np.abs(x), '__abs__') + __inv__ = _make_deprecated_unary(lambda x: -x, '__inv__') def __getattr__(self, attr): if attr in self._internal_names_set: @@ -165,8 +180,12 @@ def __getattr__(self, attr): raise ValueError(".resample() is now a deferred operation\n" "\tuse .resample(...).mean() instead of " ".resample(...)") - if attr not in self._deprecated_valids: - self = self._deprecated() + + matches_pattern = any(attr.startswith(x) for x + in self._deprecated_valid_patterns) + if not matches_pattern and attr not in self._deprecated_valids: + self = self._deprecated(attr) + return object.__getattribute__(self, attr) def __setattr__(self, attr, value): @@ -182,7 +201,7 @@ def __getitem__(self, key): # compat for deprecated if isinstance(self.obj, com.ABCSeries): - return self._deprecated()[key] + return self._deprecated('__getitem__')[key] raise @@ -197,6 +216,10 @@ def _convert_obj(self, obj): Parameters ---------- obj : the object to be resampled + + Returns + ------- + obj : converted object """ obj = obj.consolidate() return obj @@ -230,7 +253,7 @@ def _assure_grouper(self): def plot(self, *args, **kwargs): # for compat with prior versions, we want to # have the warnings shown here and just have this work - return self._deprecated().plot(*args, **kwargs) + return self._deprecated('plot').plot(*args, **kwargs) def aggregate(self, arg, *args, **kwargs): """ @@ -696,6 +719,11 @@ def _upsample(self, method, limit=None): self._set_binner() if self.axis: raise AssertionError('axis must be 0') + if self._from_selection: + raise ValueError("Upsampling from level= or on= selection" + " is not supported, use .set_index(...)" + " to explicitly set index to" + " datetime-like") ax = self.ax obj = self._selected_obj @@ -753,7 +781,15 @@ def _convert_obj(self, obj): # convert to timestamp if not (self.kind is None or self.kind == 'period'): - obj = obj.to_timestamp(how=self.convention) + if self._from_selection: + # see GH 14008, GH 12871 + msg = ("Resampling from level= or on= selection" + " with a PeriodIndex is not currently supported," + " use .set_index(...) 
to explicitly set index") + raise NotImplementedError(msg) + else: + obj = obj.to_timestamp(how=self.convention) + return obj def aggregate(self, arg, *args, **kwargs): @@ -774,7 +810,7 @@ def _get_new_index(self): else: start = ax[0].asfreq(self.freq, how=self.convention) end = ax[-1].asfreq(self.freq, how='end') - values = period_range(start, end, freq=self.freq).values + values = period_range(start, end, freq=self.freq).asi8 return ax._shallow_copy(values, freq=self.freq) @@ -805,7 +841,8 @@ def _downsample(self, how, **kwargs): if len(new_index) == 0: bins = [] else: - rng = np.arange(memb.values[0], memb.values[-1] + 1) + i8 = memb.asi8 + rng = np.arange(i8[0], i8[-1] + 1) bins = memb.searchsorted(rng, side='right') grouper = BinGrouper(bins, new_index) return self._groupby_and_aggregate(how, grouper=grouper) @@ -830,6 +867,11 @@ def _upsample(self, method, limit=None): .fillna """ + if self._from_selection: + raise ValueError("Upsampling from level= or on= selection" + " is not supported, use .set_index(...)" + " to explicitly set index to" + " datetime-like") # we may need to actually resample as if we are timestamps if self.kind == 'timestamp': return super(PeriodIndexResampler, self)._upsample(method, @@ -1046,7 +1088,12 @@ def _get_binner_for_grouping(self, obj): l = [] for key, group in grouper.get_iterator(self.ax): l.extend([key] * len(group)) - grouper = binner.__class__(l, freq=binner.freq, name=binner.name) + + if isinstance(self.ax, PeriodIndex): + grouper = binner.__class__(l, freq=binner.freq, name=binner.name) + else: + # resampling causes duplicated values, specifying freq is invalid + grouper = binner.__class__(l, name=binner.name) # since we may have had to sort # may need to reorder groups here diff --git a/pandas/tseries/tdi.py b/pandas/tseries/tdi.py index 3e12cf14e7485..c527bbad555f9 100644 --- a/pandas/tseries/tdi.py +++ b/pandas/tseries/tdi.py @@ -2,11 +2,20 @@ from datetime import timedelta import numpy as np -from pandas.core.common import (ABCSeries, _TD_DTYPE, _maybe_box, - _values_from_object, isnull, - is_integer, is_float, is_integer_dtype, - is_object_dtype, is_timedelta64_dtype, - is_timedelta64_ns_dtype) +from pandas.types.common import (_TD_DTYPE, + is_integer, is_float, + is_bool_dtype, + is_list_like, + is_scalar, + is_integer_dtype, + is_object_dtype, + is_timedelta64_dtype, + is_timedelta64_ns_dtype, + _ensure_int64) +from pandas.types.missing import isnull +from pandas.types.generic import ABCSeries +from pandas.core.common import _maybe_box, _values_from_object + from pandas.core.index import Index, Int64Index import pandas.compat as compat from pandas.compat import u @@ -23,7 +32,7 @@ import pandas.lib as lib import pandas.tslib as tslib -import pandas.algos as _algos +import pandas._join as _join import pandas.index as _index Timedelta = tslib.Timedelta @@ -35,16 +44,20 @@ def _td_index_cmp(opname, nat_result=False): """ def wrapper(self, other): + msg = "cannot compare a TimedeltaIndex with type {0}" func = getattr(super(TimedeltaIndex, self), opname) - if _is_convertible_to_td(other): - other = _to_m8(other) + if _is_convertible_to_td(other) or other is tslib.NaT: + try: + other = _to_m8(other) + except ValueError: + # failed to parse as timedelta + raise TypeError(msg.format(type(other))) result = func(other) - if com.isnull(other): + if isnull(other): result.fill(nat_result) else: - if not com.is_list_like(other): - raise TypeError("cannot compare a TimedeltaIndex with type " - "{0}".format(type(other))) + if not is_list_like(other): + 
raise TypeError(msg.format(type(other))) other = TimedeltaIndex(other).values result = func(other) @@ -62,7 +75,7 @@ def wrapper(self, other): result[self._isnan] = nat_result # support of bool dtype indexers - if com.is_bool_dtype(result): + if is_bool_dtype(result): return result return Index(result) @@ -97,6 +110,9 @@ class TimedeltaIndex(DatetimeIndexOpsMixin, TimelikeOps, Int64Index): the 'left', 'right', or both sides (None) name : object Name to be stored in the index + + To learn more about the frequency strings, please see `this link + `__. """ _typ = 'timedeltaindex' @@ -106,11 +122,11 @@ def _join_i8_wrapper(joinf, **kwargs): return DatetimeIndexOpsMixin._join_i8_wrapper( joinf, dtype='m8[ns]', **kwargs) - _inner_indexer = _join_i8_wrapper(_algos.inner_join_indexer_int64) - _outer_indexer = _join_i8_wrapper(_algos.outer_join_indexer_int64) - _left_indexer = _join_i8_wrapper(_algos.left_join_indexer_int64) + _inner_indexer = _join_i8_wrapper(_join.inner_join_indexer_int64) + _outer_indexer = _join_i8_wrapper(_join.outer_join_indexer_int64) + _left_indexer = _join_i8_wrapper(_join.left_join_indexer_int64) _left_indexer_unique = _join_i8_wrapper( - _algos.left_join_indexer_unique_int64, with_indexers=False) + _join.left_join_indexer_unique_int64, with_indexers=False) _arrmap = None _datetimelike_ops = ['days', 'seconds', 'microseconds', 'nanoseconds', 'freq', 'components'] @@ -138,8 +154,9 @@ def __new__(cls, data=None, unit=None, if isinstance(data, TimedeltaIndex) and freq is None and name is None: if copy: - data = data.copy() - return data + return data.copy() + else: + return data._shallow_copy() freq_infer = False if not isinstance(freq, DateOffset): @@ -170,7 +187,7 @@ def __new__(cls, data=None, unit=None, data = to_timedelta(data, unit=unit, box=False) if not isinstance(data, (np.ndarray, Index, ABCSeries)): - if lib.isscalar(data): + if is_scalar(data): raise ValueError('TimedeltaIndex() must be called with a ' 'collection of some kind, %s was passed' % repr(data)) @@ -256,7 +273,7 @@ def _simple_new(cls, values, name=None, freq=None, **kwargs): if values.dtype == np.object_: values = tslib.array_to_timedelta64(values) if values.dtype != _TD_DTYPE: - values = com._ensure_int64(values).view(_TD_DTYPE) + values = _ensure_int64(values).view(_TD_DTYPE) result = object.__new__(cls) result._data = values @@ -265,9 +282,6 @@ def _simple_new(cls, values, name=None, freq=None, **kwargs): result._reset_identity() return result - _na_value = tslib.NaT - """The expected NA value to use with this index.""" - @property def _formatter_func(self): from pandas.formats.format import _get_format_timedelta64 @@ -491,34 +505,6 @@ def union(self, other): result.freq = to_offset(result.inferred_freq) return result - def append(self, other): - """ - Append a collection of Index options together - - Parameters - ---------- - other : Index or list/tuple of indices - - Returns - ------- - appended : Index - """ - name = self.name - to_concat = [self] - - if isinstance(other, (list, tuple)): - to_concat = to_concat + list(other) - else: - to_concat.append(other) - - for obj in to_concat: - if isinstance(obj, Index) and obj.name != name: - name = None - break - - to_concat = self._ensure_compat_concat(to_concat) - return Index(_concat._concat_compat(to_concat), name=name) - def join(self, other, how='left', level=None, return_indexers=False): """ See Index.join @@ -683,6 +669,10 @@ def get_loc(self, key, method=None, tolerance=None): ------- loc : int """ + + if isnull(key): + key = tslib.NaT + if 
tolerance is not None: # try converting tolerance now, so errors don't get swallowed by # the try/except clauses below @@ -740,7 +730,7 @@ def _maybe_cast_slice_bound(self, label, side, kind): def _get_string_slice(self, key, use_lhs=True, use_rhs=True): freq = getattr(self, 'freqstr', getattr(self, 'inferred_freq', None)) - if is_integer(key) or is_float(key): + if is_integer(key) or is_float(key) or key is tslib.NaT: self._invalid_indexer('slice', key) loc = self._partial_td_slice(key, freq, use_lhs=use_lhs, use_rhs=use_rhs) @@ -816,22 +806,6 @@ def dtype(self): def is_all_dates(self): return True - def equals(self, other): - """ - Determines if two Index objects contain the same elements. - """ - if self.is_(other): - return True - - if (not hasattr(other, 'inferred_type') or - other.inferred_type != 'timedelta64'): - try: - other = TimedeltaIndex(other) - except: - return False - - return np.array_equal(self.asi8, other.asi8) - def insert(self, loc, item): """ Make new Index inserting new item at location @@ -900,9 +874,9 @@ def delete(self, loc): if loc in (0, -len(self), -1, len(self) - 1): freq = self.freq else: - if com.is_list_like(loc): + if is_list_like(loc): loc = lib.maybe_indices_to_slice( - com._ensure_int64(np.array(loc)), len(self)) + _ensure_int64(np.array(loc)), len(self)) if isinstance(loc, slice) and loc.step in (1, None): if (loc.start in (0, None) or loc.stop in (len(self), None)): freq = self.freq @@ -989,13 +963,16 @@ def timedelta_range(start=None, end=None, periods=None, freq='D', Make the interval closed with respect to the given frequency to the 'left', 'right', or both sides (None) - Notes - ----- - 2 of start, end, or periods must be specified - Returns ------- rng : TimedeltaIndex + + Notes + ----- + 2 of start, end, or periods must be specified. + + To learn more about the frequency strings, please see `this link + `__. 
""" return TimedeltaIndex(start=start, end=end, periods=periods, freq=freq, name=name, diff --git a/pandas/tseries/tests/test_base.py b/pandas/tseries/tests/test_base.py index 7077a23d5abcb..8a86fcba32ecb 100644 --- a/pandas/tseries/tests/test_base.py +++ b/pandas/tseries/tests/test_base.py @@ -124,10 +124,11 @@ def test_minmax(self): def test_numpy_minmax(self): dr = pd.date_range(start='2016-01-15', end='2016-01-20') - self.assertEqual(np.min(dr), Timestamp( - '2016-01-15 00:00:00', offset='D')) - self.assertEqual(np.max(dr), Timestamp( - '2016-01-20 00:00:00', offset='D')) + + self.assertEqual(np.min(dr), + Timestamp('2016-01-15 00:00:00', freq='D')) + self.assertEqual(np.max(dr), + Timestamp('2016-01-20 00:00:00', freq='D')) errmsg = "the 'out' parameter is not supported" tm.assertRaisesRegexp(ValueError, errmsg, np.min, dr, out=0) @@ -148,42 +149,60 @@ def test_round(self): elt = rng[1] expected_rng = DatetimeIndex([ - Timestamp('2016-01-01 00:00:00', tz=tz, offset='30T'), - Timestamp('2016-01-01 00:00:00', tz=tz, offset='30T'), - Timestamp('2016-01-01 01:00:00', tz=tz, offset='30T'), - Timestamp('2016-01-01 02:00:00', tz=tz, offset='30T'), - Timestamp('2016-01-01 02:00:00', tz=tz, offset='30T'), + Timestamp('2016-01-01 00:00:00', tz=tz, freq='30T'), + Timestamp('2016-01-01 00:00:00', tz=tz, freq='30T'), + Timestamp('2016-01-01 01:00:00', tz=tz, freq='30T'), + Timestamp('2016-01-01 02:00:00', tz=tz, freq='30T'), + Timestamp('2016-01-01 02:00:00', tz=tz, freq='30T'), ]) expected_elt = expected_rng[1] tm.assert_index_equal(rng.round(freq='H'), expected_rng) self.assertEqual(elt.round(freq='H'), expected_elt) - msg = "Could not evaluate foo" - tm.assertRaisesRegexp(ValueError, msg, rng.round, freq='foo') - tm.assertRaisesRegexp(ValueError, msg, elt.round, freq='foo') + msg = pd.tseries.frequencies._INVALID_FREQ_ERROR + with tm.assertRaisesRegexp(ValueError, msg): + rng.round(freq='foo') + with tm.assertRaisesRegexp(ValueError, msg): + elt.round(freq='foo') msg = " is a non-fixed frequency" tm.assertRaisesRegexp(ValueError, msg, rng.round, freq='M') tm.assertRaisesRegexp(ValueError, msg, elt.round, freq='M') - def test_repeat(self): - reps = 2 - - for tz in self.tz: - rng = pd.date_range(start='2016-01-01', periods=2, - freq='30Min', tz=tz) + def test_repeat_range(self): + rng = date_range('1/1/2000', '1/1/2001') - expected_rng = DatetimeIndex([ - Timestamp('2016-01-01 00:00:00', tz=tz, offset='30T'), - Timestamp('2016-01-01 00:00:00', tz=tz, offset='30T'), - Timestamp('2016-01-01 00:30:00', tz=tz, offset='30T'), - Timestamp('2016-01-01 00:30:00', tz=tz, offset='30T'), - ]) + result = rng.repeat(5) + self.assertIsNone(result.freq) + self.assertEqual(len(result), 5 * len(rng)) - tm.assert_index_equal(rng.repeat(reps), expected_rng) + for tz in self.tz: + index = pd.date_range('2001-01-01', periods=2, freq='D', tz=tz) + exp = pd.DatetimeIndex(['2001-01-01', '2001-01-01', + '2001-01-02', '2001-01-02'], tz=tz) + for res in [index.repeat(2), np.repeat(index, 2)]: + tm.assert_index_equal(res, exp) + self.assertIsNone(res.freq) + + index = pd.date_range('2001-01-01', periods=2, freq='2D', tz=tz) + exp = pd.DatetimeIndex(['2001-01-01', '2001-01-01', + '2001-01-03', '2001-01-03'], tz=tz) + for res in [index.repeat(2), np.repeat(index, 2)]: + tm.assert_index_equal(res, exp) + self.assertIsNone(res.freq) + + index = pd.DatetimeIndex(['2001-01-01', 'NaT', '2003-01-01'], + tz=tz) + exp = pd.DatetimeIndex(['2001-01-01', '2001-01-01', '2001-01-01', + 'NaT', 'NaT', 'NaT', + '2003-01-01', '2003-01-01', 
'2003-01-01'], + tz=tz) + for res in [index.repeat(3), np.repeat(index, 3)]: + tm.assert_index_equal(res, exp) + self.assertIsNone(res.freq) - def test_numpy_repeat(self): + def test_repeat(self): reps = 2 msg = "the 'axis' parameter is not supported" @@ -192,12 +211,16 @@ def test_numpy_repeat(self): freq='30Min', tz=tz) expected_rng = DatetimeIndex([ - Timestamp('2016-01-01 00:00:00', tz=tz, offset='30T'), - Timestamp('2016-01-01 00:00:00', tz=tz, offset='30T'), - Timestamp('2016-01-01 00:30:00', tz=tz, offset='30T'), - Timestamp('2016-01-01 00:30:00', tz=tz, offset='30T'), + Timestamp('2016-01-01 00:00:00', tz=tz, freq='30T'), + Timestamp('2016-01-01 00:00:00', tz=tz, freq='30T'), + Timestamp('2016-01-01 00:30:00', tz=tz, freq='30T'), + Timestamp('2016-01-01 00:30:00', tz=tz, freq='30T'), ]) + res = rng.repeat(reps) + tm.assert_index_equal(res, expected_rng) + self.assertIsNone(res.freq) + tm.assert_index_equal(np.repeat(rng, reps), expected_rng) tm.assertRaisesRegexp(ValueError, msg, np.repeat, rng, reps, axis=1) @@ -332,12 +355,12 @@ def test_resolution(self): ['day', 'day', 'day', 'day', 'hour', 'minute', 'second', 'millisecond', 'microsecond']): - for tz in [None, 'Asia/Tokyo', 'US/Eastern']: + for tz in self.tz: idx = pd.date_range(start='2013-04-01', periods=30, freq=freq, tz=tz) self.assertEqual(idx.resolution, expected) - def test_add_iadd(self): + def test_union(self): for tz in self.tz: # union rng1 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) @@ -355,17 +378,12 @@ def test_add_iadd(self): for rng, other, expected in [(rng1, other1, expected1), (rng2, other2, expected2), (rng3, other3, expected3)]: - # GH9094 - with tm.assert_produces_warning(FutureWarning): - result_add = rng + other - result_union = rng.union(other) - tm.assert_index_equal(result_add, expected) + result_union = rng.union(other) tm.assert_index_equal(result_union, expected) - # GH9094 - with tm.assert_produces_warning(FutureWarning): - rng += other - tm.assert_index_equal(rng, expected) + + def test_add_iadd(self): + for tz in self.tz: # offset offsets = [pd.offsets.Hour(2), timedelta(hours=2), @@ -398,7 +416,26 @@ def test_add_iadd(self): with tm.assertRaisesRegexp(TypeError, msg): Timestamp('2011-01-01') + idx - def test_sub_isub(self): + def test_add_dti_dti(self): + # previously performed setop (deprecated in 0.16.0), now raises + # TypeError (GH14164) + + dti = date_range('20130101', periods=3) + dti_tz = date_range('20130101', periods=3).tz_localize('US/Eastern') + + with tm.assertRaises(TypeError): + dti + dti + + with tm.assertRaises(TypeError): + dti_tz + dti_tz + + with tm.assertRaises(TypeError): + dti_tz + dti + + with tm.assertRaises(TypeError): + dti + dti_tz + + def test_difference(self): for tz in self.tz: # diff rng1 = pd.date_range('1/1/2000', freq='D', periods=5, tz=tz) @@ -416,9 +453,11 @@ def test_sub_isub(self): for rng, other, expected in [(rng1, other1, expected1), (rng2, other2, expected2), (rng3, other3, expected3)]: - result_union = rng.difference(other) + result_diff = rng.difference(other) + tm.assert_index_equal(result_diff, expected) - tm.assert_index_equal(result_union, expected) + def test_sub_isub(self): + for tz in self.tz: # offset offsets = [pd.offsets.Hour(2), timedelta(hours=2), @@ -426,9 +465,10 @@ def test_sub_isub(self): for delta in offsets: rng = pd.date_range('2000-01-01', '2000-02-01', tz=tz) - result = rng - delta expected = pd.date_range('1999-12-31 22:00', '2000-01-31 22:00', tz=tz) + + result = rng - delta tm.assert_index_equal(result, expected) rng 
-= delta tm.assert_index_equal(rng, expected) @@ -443,18 +483,101 @@ def test_sub_isub(self): rng -= 1 tm.assert_index_equal(rng, expected) + def test_sub_dti_dti(self): + # previously performed setop (deprecated in 0.16.0), now changed to + # return subtraction -> TimeDeltaIndex (GH ...) + + dti = date_range('20130101', periods=3) + dti_tz = date_range('20130101', periods=3).tz_localize('US/Eastern') + dti_tz2 = date_range('20130101', periods=3).tz_localize('UTC') + expected = TimedeltaIndex([0, 0, 0]) + + result = dti - dti + tm.assert_index_equal(result, expected) + + result = dti_tz - dti_tz + tm.assert_index_equal(result, expected) + + with tm.assertRaises(TypeError): + dti_tz - dti + + with tm.assertRaises(TypeError): + dti - dti_tz + + with tm.assertRaises(TypeError): + dti_tz - dti_tz2 + + # isub + dti -= dti + tm.assert_index_equal(dti, expected) + + # different length raises ValueError + dti1 = date_range('20130101', periods=3) + dti2 = date_range('20130101', periods=4) + with tm.assertRaises(ValueError): + dti1 - dti2 + + # NaN propagation + dti1 = DatetimeIndex(['2012-01-01', np.nan, '2012-01-03']) + dti2 = DatetimeIndex(['2012-01-02', '2012-01-03', np.nan]) + expected = TimedeltaIndex(['1 days', np.nan, np.nan]) + result = dti2 - dti1 + tm.assert_index_equal(result, expected) + + def test_sub_period(self): + # GH 13078 + # not supported, check TypeError + p = pd.Period('2011-01-01', freq='D') + + for freq in [None, 'D']: + idx = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], freq=freq) + + with tm.assertRaises(TypeError): + idx - p + + with tm.assertRaises(TypeError): + p - idx + + def test_comp_nat(self): + left = pd.DatetimeIndex([pd.Timestamp('2011-01-01'), pd.NaT, + pd.Timestamp('2011-01-03')]) + right = pd.DatetimeIndex([pd.NaT, pd.NaT, pd.Timestamp('2011-01-03')]) + + for l, r in [(left, right), (left.asobject, right.asobject)]: + result = l == r + expected = np.array([False, False, True]) + tm.assert_numpy_array_equal(result, expected) + + result = l != r + expected = np.array([True, True, False]) + tm.assert_numpy_array_equal(result, expected) + + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(l == pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT == r, expected) + + expected = np.array([True, True, True]) + tm.assert_numpy_array_equal(l != pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT != l, expected) + + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(l < pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT > l, expected) + def test_value_counts_unique(self): # GH 7735 - for tz in [None, 'UTC', 'Asia/Tokyo', 'US/Eastern']: + for tz in self.tz: idx = pd.date_range('2011-01-01 09:00', freq='H', periods=10) # create repeated values, 'n'th element is repeated by n+1 times - idx = DatetimeIndex( - np.repeat(idx.values, range(1, len(idx) + 1)), tz=tz) + idx = DatetimeIndex(np.repeat(idx.values, range(1, len(idx) + 1)), + tz=tz) exp_idx = pd.date_range('2011-01-01 18:00', freq='-1H', periods=10, tz=tz) expected = Series(range(10, 0, -1), index=exp_idx, dtype='int64') - tm.assert_series_equal(idx.value_counts(), expected) + + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(), expected) expected = pd.date_range('2011-01-01 09:00', freq='H', periods=10, tz=tz) @@ -464,15 +587,20 @@ def test_value_counts_unique(self): '2013-01-01 09:00', '2013-01-01 08:00', '2013-01-01 08:00', pd.NaT], tz=tz) - exp_idx = DatetimeIndex( - ['2013-01-01 09:00', '2013-01-01 08:00'], tz=tz) + exp_idx = 
DatetimeIndex(['2013-01-01 09:00', '2013-01-01 08:00'], + tz=tz) expected = Series([3, 2], index=exp_idx) - tm.assert_series_equal(idx.value_counts(), expected) - exp_idx = DatetimeIndex( - ['2013-01-01 09:00', '2013-01-01 08:00', pd.NaT], tz=tz) + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(), expected) + + exp_idx = DatetimeIndex(['2013-01-01 09:00', '2013-01-01 08:00', + pd.NaT], tz=tz) expected = Series([3, 2, 1], index=exp_idx) - tm.assert_series_equal(idx.value_counts(dropna=False), expected) + + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(dropna=False), + expected) tm.assert_index_equal(idx.unique(), exp_idx) @@ -485,8 +613,8 @@ def test_nonunique_contains(self): def test_order(self): # with freq - idx1 = DatetimeIndex( - ['2011-01-01', '2011-01-02', '2011-01-03'], freq='D', name='idx') + idx1 = DatetimeIndex(['2011-01-01', '2011-01-02', + '2011-01-03'], freq='D', name='idx') idx2 = DatetimeIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00'], freq='H', tz='Asia/Tokyo', name='tzidx') @@ -505,7 +633,8 @@ def test_order(self): ordered, indexer = idx.sort_values(return_indexer=True) self.assert_index_equal(ordered, idx) self.assert_numpy_array_equal(indexer, - np.array([0, 1, 2], dtype=np.int64)) + np.array([0, 1, 2]), + check_dtype=False) self.assertEqual(ordered.freq, idx.freq) ordered, indexer = idx.sort_values(return_indexer=True, @@ -513,54 +642,56 @@ def test_order(self): expected = idx[::-1] self.assert_index_equal(ordered, expected) self.assert_numpy_array_equal(indexer, - np.array([2, 1, 0], dtype=np.int64)) + np.array([2, 1, 0]), + check_dtype=False) self.assertEqual(ordered.freq, expected.freq) self.assertEqual(ordered.freq.n, -1) # without freq - idx1 = DatetimeIndex(['2011-01-01', '2011-01-03', '2011-01-05', - '2011-01-02', '2011-01-01'], name='idx1') - exp1 = DatetimeIndex(['2011-01-01', '2011-01-01', '2011-01-02', - '2011-01-03', '2011-01-05'], name='idx1') - - idx2 = DatetimeIndex(['2011-01-01', '2011-01-03', '2011-01-05', - '2011-01-02', '2011-01-01'], - tz='Asia/Tokyo', name='idx2') - - # TODO(wesm): unused? 
- - # exp2 = DatetimeIndex(['2011-01-01', '2011-01-01', '2011-01-02', - # '2011-01-03', '2011-01-05'], - # tz='Asia/Tokyo', name='idx2') - - # idx3 = DatetimeIndex([pd.NaT, '2011-01-03', '2011-01-05', - # '2011-01-02', pd.NaT], name='idx3') - # exp3 = DatetimeIndex([pd.NaT, pd.NaT, '2011-01-02', '2011-01-03', - # '2011-01-05'], name='idx3') - - for idx, expected in [(idx1, exp1), (idx1, exp1), (idx1, exp1)]: - ordered = idx.sort_values() - self.assert_index_equal(ordered, expected) - self.assertIsNone(ordered.freq) - - ordered = idx.sort_values(ascending=False) - self.assert_index_equal(ordered, expected[::-1]) - self.assertIsNone(ordered.freq) - - ordered, indexer = idx.sort_values(return_indexer=True) - self.assert_index_equal(ordered, expected) - - exp = np.array([0, 4, 3, 1, 2], dtype=np.int64) - self.assert_numpy_array_equal(indexer, exp) - self.assertIsNone(ordered.freq) - - ordered, indexer = idx.sort_values(return_indexer=True, - ascending=False) - self.assert_index_equal(ordered, expected[::-1]) - - exp = np.array([2, 1, 3, 4, 0], dtype=np.int64) - self.assert_numpy_array_equal(indexer, exp) - self.assertIsNone(ordered.freq) + for tz in self.tz: + idx1 = DatetimeIndex(['2011-01-01', '2011-01-03', '2011-01-05', + '2011-01-02', '2011-01-01'], + tz=tz, name='idx1') + exp1 = DatetimeIndex(['2011-01-01', '2011-01-01', '2011-01-02', + '2011-01-03', '2011-01-05'], + tz=tz, name='idx1') + + idx2 = DatetimeIndex(['2011-01-01', '2011-01-03', '2011-01-05', + '2011-01-02', '2011-01-01'], + tz=tz, name='idx2') + + exp2 = DatetimeIndex(['2011-01-01', '2011-01-01', '2011-01-02', + '2011-01-03', '2011-01-05'], + tz=tz, name='idx2') + + idx3 = DatetimeIndex([pd.NaT, '2011-01-03', '2011-01-05', + '2011-01-02', pd.NaT], tz=tz, name='idx3') + exp3 = DatetimeIndex([pd.NaT, pd.NaT, '2011-01-02', '2011-01-03', + '2011-01-05'], tz=tz, name='idx3') + + for idx, expected in [(idx1, exp1), (idx2, exp2), (idx3, exp3)]: + ordered = idx.sort_values() + self.assert_index_equal(ordered, expected) + self.assertIsNone(ordered.freq) + + ordered = idx.sort_values(ascending=False) + self.assert_index_equal(ordered, expected[::-1]) + self.assertIsNone(ordered.freq) + + ordered, indexer = idx.sort_values(return_indexer=True) + self.assert_index_equal(ordered, expected) + + exp = np.array([0, 4, 3, 1, 2]) + self.assert_numpy_array_equal(indexer, exp, check_dtype=False) + self.assertIsNone(ordered.freq) + + ordered, indexer = idx.sort_values(return_indexer=True, + ascending=False) + self.assert_index_equal(ordered, expected[::-1]) + + exp = np.array([2, 1, 3, 4, 0]) + self.assert_numpy_array_equal(indexer, exp, check_dtype=False) + self.assertIsNone(ordered.freq) def test_getitem(self): idx1 = pd.date_range('2011-01-01', '2011-01-31', freq='D', name='idx') @@ -609,6 +740,27 @@ def test_drop_duplicates_metadata(self): self.assert_index_equal(idx, result) self.assertIsNone(result.freq) + def test_drop_duplicates(self): + # to check Index/Series compat + base = pd.date_range('2011-01-01', '2011-01-31', freq='D', name='idx') + idx = base.append(base[:5]) + + res = idx.drop_duplicates() + tm.assert_index_equal(res, base) + res = Series(idx).drop_duplicates() + tm.assert_series_equal(res, Series(base)) + + res = idx.drop_duplicates(keep='last') + exp = base[5:].append(base[:5]) + tm.assert_index_equal(res, exp) + res = Series(idx).drop_duplicates(keep='last') + tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36))) + + res = idx.drop_duplicates(keep=False) + tm.assert_index_equal(res, base[5:]) + res = 
Series(idx).drop_duplicates(keep=False) + tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31))) + def test_take(self): # GH 10295 idx1 = pd.date_range('2011-01-01', '2011-01-31', freq='D', name='idx') @@ -689,7 +841,7 @@ def test_nat_new(self): def test_shift(self): # GH 9903 - for tz in [None, 'US/Eastern', 'Asia/Tokyo']: + for tz in self.tz: idx = pd.DatetimeIndex([], name='xxx', tz=tz) tm.assert_index_equal(idx.shift(0, freq='H'), idx) tm.assert_index_equal(idx.shift(3, freq='H'), idx) @@ -704,6 +856,58 @@ def test_shift(self): '2011-01-01 09:00'], name='xxx', tz=tz) tm.assert_index_equal(idx.shift(-3, freq='H'), exp) + def test_nat(self): + self.assertIs(pd.DatetimeIndex._na_value, pd.NaT) + self.assertIs(pd.DatetimeIndex([])._na_value, pd.NaT) + + for tz in [None, 'US/Eastern', 'UTC']: + idx = pd.DatetimeIndex(['2011-01-01', '2011-01-02'], tz=tz) + self.assertTrue(idx._can_hold_na) + + tm.assert_numpy_array_equal(idx._isnan, np.array([False, False])) + self.assertFalse(idx.hasnans) + tm.assert_numpy_array_equal(idx._nan_idxs, + np.array([], dtype=np.int64)) + + idx = pd.DatetimeIndex(['2011-01-01', 'NaT'], tz=tz) + self.assertTrue(idx._can_hold_na) + + tm.assert_numpy_array_equal(idx._isnan, np.array([False, True])) + self.assertTrue(idx.hasnans) + tm.assert_numpy_array_equal(idx._nan_idxs, + np.array([1], dtype=np.int64)) + + def test_equals(self): + # GH 13107 + for tz in [None, 'UTC', 'US/Eastern', 'Asia/Tokyo']: + idx = pd.DatetimeIndex(['2011-01-01', '2011-01-02', 'NaT']) + self.assertTrue(idx.equals(idx)) + self.assertTrue(idx.equals(idx.copy())) + self.assertTrue(idx.equals(idx.asobject)) + self.assertTrue(idx.asobject.equals(idx)) + self.assertTrue(idx.asobject.equals(idx.asobject)) + self.assertFalse(idx.equals(list(idx))) + self.assertFalse(idx.equals(pd.Series(idx))) + + idx2 = pd.DatetimeIndex(['2011-01-01', '2011-01-02', 'NaT'], + tz='US/Pacific') + self.assertFalse(idx.equals(idx2)) + self.assertFalse(idx.equals(idx2.copy())) + self.assertFalse(idx.equals(idx2.asobject)) + self.assertFalse(idx.asobject.equals(idx2)) + self.assertFalse(idx.equals(list(idx2))) + self.assertFalse(idx.equals(pd.Series(idx2))) + + # same internal, different tz + idx3 = pd.DatetimeIndex._simple_new(idx.asi8, tz='US/Pacific') + tm.assert_numpy_array_equal(idx.asi8, idx3.asi8) + self.assertFalse(idx.equals(idx3)) + self.assertFalse(idx.equals(idx3.copy())) + self.assertFalse(idx.equals(idx3.asobject)) + self.assertFalse(idx.asobject.equals(idx3)) + self.assertFalse(idx.equals(list(idx3))) + self.assertFalse(idx.equals(pd.Series(idx3))) + class TestTimedeltaIndexOps(Ops): def setUp(self): @@ -804,9 +1008,11 @@ def test_round(self): tm.assert_index_equal(td.round(freq='H'), expected_rng) self.assertEqual(elt.round(freq='H'), expected_elt) - msg = "Could not evaluate foo" - tm.assertRaisesRegexp(ValueError, msg, td.round, freq='foo') - tm.assertRaisesRegexp(ValueError, msg, elt.round, freq='foo') + msg = pd.tseries.frequencies._INVALID_FREQ_ERROR + with self.assertRaisesRegexp(ValueError, msg): + td.round(freq='foo') + with tm.assertRaisesRegexp(ValueError, msg): + elt.round(freq='foo') msg = " is a non-fixed frequency" tm.assertRaisesRegexp(ValueError, msg, td.round, freq='M') @@ -1027,11 +1233,11 @@ def test_subtraction_ops_with_tz(self): # check that dt/dti subtraction ops with tz are validated dti = date_range('20130101', periods=3) ts = Timestamp('20130101') - dt = ts.to_datetime() + dt = ts.to_pydatetime() dti_tz = date_range('20130101', periods=3).tz_localize('US/Eastern') 
ts_tz = Timestamp('20130101').tz_localize('US/Eastern') ts_tz2 = Timestamp('20130101').tz_localize('CET') - dt_tz = ts_tz.to_datetime() + dt_tz = ts_tz.to_pydatetime() td = Timedelta('1 days') def _check(result, expected): @@ -1091,50 +1297,6 @@ def _check(result, expected): ['20121231', '20130101', '20130102'], tz='US/Eastern') tm.assert_index_equal(result, expected) - def test_dti_dti_deprecated_ops(self): - - # deprecated in 0.16.0 (GH9094) - # change to return subtraction -> TimeDeltaIndex in 0.17.0 - # shoudl move to the appropriate sections above - - dti = date_range('20130101', periods=3) - dti_tz = date_range('20130101', periods=3).tz_localize('US/Eastern') - - with tm.assert_produces_warning(FutureWarning): - result = dti - dti - expected = Index([]) - tm.assert_index_equal(result, expected) - - with tm.assert_produces_warning(FutureWarning): - result = dti + dti - expected = dti - tm.assert_index_equal(result, expected) - - with tm.assert_produces_warning(FutureWarning): - result = dti_tz - dti_tz - expected = Index([]) - tm.assert_index_equal(result, expected) - - with tm.assert_produces_warning(FutureWarning): - result = dti_tz + dti_tz - expected = dti_tz - tm.assert_index_equal(result, expected) - - with tm.assert_produces_warning(FutureWarning): - result = dti_tz - dti - expected = dti_tz - tm.assert_index_equal(result, expected) - - with tm.assert_produces_warning(FutureWarning): - result = dti - dti_tz - expected = dti - tm.assert_index_equal(result, expected) - - with tm.assert_produces_warning(FutureWarning): - self.assertRaises(TypeError, lambda: dti_tz + dti) - with tm.assert_produces_warning(FutureWarning): - self.assertRaises(TypeError, lambda: dti + dti_tz) - def test_dti_tdi_numeric_ops(self): # These are normally union/diff set-like ops @@ -1157,6 +1319,20 @@ def test_dti_tdi_numeric_ops(self): expected = DatetimeIndex(['20121231', pd.NaT, '20130101']) tm.assert_index_equal(result, expected) + def test_sub_period(self): + # GH 13078 + # not supported, check TypeError + p = pd.Period('2011-01-01', freq='D') + + for freq in [None, 'H']: + idx = pd.TimedeltaIndex(['1 hours', '2 hours'], freq=freq) + + with tm.assertRaises(TypeError): + idx - p + + with tm.assertRaises(TypeError): + p - idx + def test_addition_ops(self): # with datetimes/timedelta and tdi/dti @@ -1207,6 +1383,32 @@ def test_addition_ops(self): expected = Timestamp('20130102') self.assertEqual(result, expected) + def test_comp_nat(self): + left = pd.TimedeltaIndex([pd.Timedelta('1 days'), pd.NaT, + pd.Timedelta('3 days')]) + right = pd.TimedeltaIndex([pd.NaT, pd.NaT, pd.Timedelta('3 days')]) + + for l, r in [(left, right), (left.asobject, right.asobject)]: + result = l == r + expected = np.array([False, False, True]) + tm.assert_numpy_array_equal(result, expected) + + result = l != r + expected = np.array([True, True, False]) + tm.assert_numpy_array_equal(result, expected) + + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(l == pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT == r, expected) + + expected = np.array([True, True, True]) + tm.assert_numpy_array_equal(l != pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT != l, expected) + + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(l < pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT > l, expected) + def test_value_counts_unique(self): # GH 7735 @@ -1216,23 +1418,29 @@ def test_value_counts_unique(self): exp_idx = timedelta_range('1 days 18:00:00', freq='-1H', periods=10) expected = 
Series(range(10, 0, -1), index=exp_idx, dtype='int64') - tm.assert_series_equal(idx.value_counts(), expected) + + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(), expected) expected = timedelta_range('1 days 09:00:00', freq='H', periods=10) tm.assert_index_equal(idx.unique(), expected) - idx = TimedeltaIndex( - ['1 days 09:00:00', '1 days 09:00:00', '1 days 09:00:00', - '1 days 08:00:00', '1 days 08:00:00', pd.NaT]) + idx = TimedeltaIndex(['1 days 09:00:00', '1 days 09:00:00', + '1 days 09:00:00', '1 days 08:00:00', + '1 days 08:00:00', pd.NaT]) exp_idx = TimedeltaIndex(['1 days 09:00:00', '1 days 08:00:00']) expected = Series([3, 2], index=exp_idx) - tm.assert_series_equal(idx.value_counts(), expected) - exp_idx = TimedeltaIndex(['1 days 09:00:00', '1 days 08:00:00', pd.NaT - ]) + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(), expected) + + exp_idx = TimedeltaIndex(['1 days 09:00:00', '1 days 08:00:00', + pd.NaT]) expected = Series([3, 2, 1], index=exp_idx) - tm.assert_series_equal(idx.value_counts(dropna=False), expected) + + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(dropna=False), expected) tm.assert_index_equal(idx.unique(), exp_idx) @@ -1271,7 +1479,8 @@ def test_order(self): ordered, indexer = idx.sort_values(return_indexer=True) self.assert_index_equal(ordered, idx) self.assert_numpy_array_equal(indexer, - np.array([0, 1, 2], dtype=np.int64)) + np.array([0, 1, 2]), + check_dtype=False) self.assertEqual(ordered.freq, idx.freq) ordered, indexer = idx.sort_values(return_indexer=True, @@ -1309,16 +1518,16 @@ def test_order(self): ordered, indexer = idx.sort_values(return_indexer=True) self.assert_index_equal(ordered, expected) - exp = np.array([0, 4, 3, 1, 2], dtype=np.int64) - self.assert_numpy_array_equal(indexer, exp) + exp = np.array([0, 4, 3, 1, 2]) + self.assert_numpy_array_equal(indexer, exp, check_dtype=False) self.assertIsNone(ordered.freq) ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) self.assert_index_equal(ordered, expected[::-1]) - exp = np.array([2, 1, 3, 4, 0], dtype=np.int64) - self.assert_numpy_array_equal(indexer, exp) + exp = np.array([2, 1, 3, 4, 0]) + self.assert_numpy_array_equal(indexer, exp, check_dtype=False) self.assertIsNone(ordered.freq) def test_getitem(self): @@ -1366,6 +1575,27 @@ def test_drop_duplicates_metadata(self): self.assert_index_equal(idx, result) self.assertIsNone(result.freq) + def test_drop_duplicates(self): + # to check Index/Series compat + base = pd.timedelta_range('1 day', '31 day', freq='D', name='idx') + idx = base.append(base[:5]) + + res = idx.drop_duplicates() + tm.assert_index_equal(res, base) + res = Series(idx).drop_duplicates() + tm.assert_series_equal(res, Series(base)) + + res = idx.drop_duplicates(keep='last') + exp = base[5:].append(base[:5]) + tm.assert_index_equal(res, exp) + res = Series(idx).drop_duplicates(keep='last') + tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36))) + + res = idx.drop_duplicates(keep=False) + tm.assert_index_equal(res, base[5:]) + res = Series(idx).drop_duplicates(keep=False) + tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31))) + def test_take(self): # GH 10295 idx1 = pd.timedelta_range('1 day', '31 day', freq='D', name='idx') @@ -1462,6 +1692,61 @@ def test_shift(self): name='xxx') tm.assert_index_equal(idx.shift(-3, freq='T'), exp) + def test_repeat(self): + index = pd.timedelta_range('1 days', periods=2, freq='D') + exp = pd.TimedeltaIndex(['1 days', '1 
days', '2 days', '2 days']) + for res in [index.repeat(2), np.repeat(index, 2)]: + tm.assert_index_equal(res, exp) + self.assertIsNone(res.freq) + + index = TimedeltaIndex(['1 days', 'NaT', '3 days']) + exp = TimedeltaIndex(['1 days', '1 days', '1 days', + 'NaT', 'NaT', 'NaT', + '3 days', '3 days', '3 days']) + for res in [index.repeat(3), np.repeat(index, 3)]: + tm.assert_index_equal(res, exp) + self.assertIsNone(res.freq) + + def test_nat(self): + self.assertIs(pd.TimedeltaIndex._na_value, pd.NaT) + self.assertIs(pd.TimedeltaIndex([])._na_value, pd.NaT) + + idx = pd.TimedeltaIndex(['1 days', '2 days']) + self.assertTrue(idx._can_hold_na) + + tm.assert_numpy_array_equal(idx._isnan, np.array([False, False])) + self.assertFalse(idx.hasnans) + tm.assert_numpy_array_equal(idx._nan_idxs, + np.array([], dtype=np.int64)) + + idx = pd.TimedeltaIndex(['1 days', 'NaT']) + self.assertTrue(idx._can_hold_na) + + tm.assert_numpy_array_equal(idx._isnan, np.array([False, True])) + self.assertTrue(idx.hasnans) + tm.assert_numpy_array_equal(idx._nan_idxs, + np.array([1], dtype=np.int64)) + + def test_equals(self): + # GH 13107 + idx = pd.TimedeltaIndex(['1 days', '2 days', 'NaT']) + self.assertTrue(idx.equals(idx)) + self.assertTrue(idx.equals(idx.copy())) + self.assertTrue(idx.equals(idx.asobject)) + self.assertTrue(idx.asobject.equals(idx)) + self.assertTrue(idx.asobject.equals(idx.asobject)) + self.assertFalse(idx.equals(list(idx))) + self.assertFalse(idx.equals(pd.Series(idx))) + + idx2 = pd.TimedeltaIndex(['2 days', '1 days', 'NaT']) + self.assertFalse(idx.equals(idx2)) + self.assertFalse(idx.equals(idx2.copy())) + self.assertFalse(idx.equals(idx2.asobject)) + self.assertFalse(idx.asobject.equals(idx2)) + self.assertFalse(idx.asobject.equals(idx2.asobject)) + self.assertFalse(idx.equals(list(idx2))) + self.assertFalse(idx.equals(pd.Series(idx2))) + class TestPeriodIndexOps(Ops): def setUp(self): @@ -1503,17 +1788,16 @@ def test_asobject_tolist(self): result = idx.asobject self.assertTrue(isinstance(result, Index)) self.assertEqual(result.dtype, object) + tm.assert_index_equal(result, expected) for i in [0, 1, 3]: - self.assertTrue(result[i], expected[i]) - self.assertTrue(result[2].ordinal, pd.tslib.iNaT) - self.assertTrue(result[2].freq, 'D') + self.assertEqual(result[i], expected[i]) + self.assertIs(result[2], pd.NaT) self.assertEqual(result.name, expected.name) result_list = idx.tolist() for i in [0, 1, 3]: - self.assertTrue(result_list[i], expected_list[i]) - self.assertTrue(result_list[2].ordinal, pd.tslib.iNaT) - self.assertTrue(result_list[2].freq, 'D') + self.assertEqual(result_list[i], expected_list[i]) + self.assertIs(result_list[2], pd.NaT) def test_minmax(self): @@ -1539,18 +1823,15 @@ def test_minmax(self): # Return NaT obj = PeriodIndex([], freq='M') result = getattr(obj, op)() - self.assertEqual(result.ordinal, tslib.iNaT) - self.assertEqual(result.freq, 'M') + self.assertIs(result, tslib.NaT) obj = PeriodIndex([pd.NaT], freq='M') result = getattr(obj, op)() - self.assertEqual(result.ordinal, tslib.iNaT) - self.assertEqual(result.freq, 'M') + self.assertIs(result, tslib.NaT) obj = PeriodIndex([pd.NaT, pd.NaT, pd.NaT], freq='M') result = getattr(obj, op)() - self.assertEqual(result.ordinal, tslib.iNaT) - self.assertEqual(result.freq, 'M') + self.assertIs(result, tslib.NaT) def test_numpy_minmax(self): pr = pd.period_range(start='2016-01-15', end='2016-01-20') @@ -1575,44 +1856,48 @@ def test_representation(self): idx1 = PeriodIndex([], freq='D') idx2 = PeriodIndex(['2011-01-01'], 
freq='D') idx3 = PeriodIndex(['2011-01-01', '2011-01-02'], freq='D') - idx4 = PeriodIndex( - ['2011-01-01', '2011-01-02', '2011-01-03'], freq='D') + idx4 = PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03'], + freq='D') idx5 = PeriodIndex(['2011', '2012', '2013'], freq='A') - idx6 = PeriodIndex( - ['2011-01-01 09:00', '2012-02-01 10:00', 'NaT'], freq='H') - + idx6 = PeriodIndex(['2011-01-01 09:00', '2012-02-01 10:00', + 'NaT'], freq='H') idx7 = pd.period_range('2013Q1', periods=1, freq="Q") idx8 = pd.period_range('2013Q1', periods=2, freq="Q") idx9 = pd.period_range('2013Q1', periods=3, freq="Q") + idx10 = PeriodIndex(['2011-01-01', '2011-02-01'], freq='3D') - exp1 = """PeriodIndex([], dtype='int64', freq='D')""" + exp1 = """PeriodIndex([], dtype='period[D]', freq='D')""" - exp2 = """PeriodIndex(['2011-01-01'], dtype='int64', freq='D')""" + exp2 = """PeriodIndex(['2011-01-01'], dtype='period[D]', freq='D')""" - exp3 = ("PeriodIndex(['2011-01-01', '2011-01-02'], dtype='int64', " + exp3 = ("PeriodIndex(['2011-01-01', '2011-01-02'], dtype='period[D]', " "freq='D')") exp4 = ("PeriodIndex(['2011-01-01', '2011-01-02', '2011-01-03'], " - "dtype='int64', freq='D')") + "dtype='period[D]', freq='D')") - exp5 = ("PeriodIndex(['2011', '2012', '2013'], dtype='int64', " + exp5 = ("PeriodIndex(['2011', '2012', '2013'], dtype='period[A-DEC]', " "freq='A-DEC')") exp6 = ("PeriodIndex(['2011-01-01 09:00', '2012-02-01 10:00', 'NaT'], " - "dtype='int64', freq='H')") - - exp7 = """PeriodIndex(['2013Q1'], dtype='int64', freq='Q-DEC')""" + "dtype='period[H]', freq='H')") - exp8 = ("PeriodIndex(['2013Q1', '2013Q2'], dtype='int64', " + exp7 = ("PeriodIndex(['2013Q1'], dtype='period[Q-DEC]', " "freq='Q-DEC')") - exp9 = ("PeriodIndex(['2013Q1', '2013Q2', '2013Q3'], dtype='int64', " + exp8 = ("PeriodIndex(['2013Q1', '2013Q2'], dtype='period[Q-DEC]', " "freq='Q-DEC')") + exp9 = ("PeriodIndex(['2013Q1', '2013Q2', '2013Q3'], " + "dtype='period[Q-DEC]', freq='Q-DEC')") + + exp10 = ("PeriodIndex(['2011-01-01', '2011-02-01'], " + "dtype='period[3D]', freq='3D')") + for idx, expected in zip([idx1, idx2, idx3, idx4, idx5, - idx6, idx7, idx8, idx9], + idx6, idx7, idx8, idx9, idx10], [exp1, exp2, exp3, exp4, exp5, - exp6, exp7, exp8, exp9]): + exp6, exp7, exp8, exp9, exp10]): for func in ['__repr__', '__unicode__', '__str__']: result = getattr(idx, func)() self.assertEqual(result, expected) @@ -1622,11 +1907,11 @@ def test_representation_to_series(self): idx1 = PeriodIndex([], freq='D') idx2 = PeriodIndex(['2011-01-01'], freq='D') idx3 = PeriodIndex(['2011-01-01', '2011-01-02'], freq='D') - idx4 = PeriodIndex( - ['2011-01-01', '2011-01-02', '2011-01-03'], freq='D') + idx4 = PeriodIndex(['2011-01-01', '2011-01-02', + '2011-01-03'], freq='D') idx5 = PeriodIndex(['2011', '2012', '2013'], freq='A') - idx6 = PeriodIndex( - ['2011-01-01 09:00', '2012-02-01 10:00', 'NaT'], freq='H') + idx6 = PeriodIndex(['2011-01-01 09:00', '2012-02-01 10:00', + 'NaT'], freq='H') idx7 = pd.period_range('2013Q1', periods=1, freq="Q") idx8 = pd.period_range('2013Q1', periods=2, freq="Q") @@ -1734,7 +2019,7 @@ def test_resolution(self): idx = pd.period_range(start='2013-04-01', periods=30, freq=freq) self.assertEqual(idx.resolution, expected) - def test_add_iadd(self): + def test_union(self): # union rng1 = pd.period_range('1/1/2000', freq='D', periods=5) other1 = pd.period_range('1/6/2000', freq='D', periods=5) @@ -1760,7 +2045,8 @@ def test_add_iadd(self): rng5 = pd.PeriodIndex(['2000-01-01 09:01', '2000-01-01 09:03', '2000-01-01 09:05'], freq='T') 
other5 = pd.PeriodIndex(['2000-01-01 09:01', '2000-01-01 09:05' - '2000-01-01 09:08'], freq='T') + '2000-01-01 09:08'], + freq='T') expected5 = pd.PeriodIndex(['2000-01-01 09:01', '2000-01-01 09:03', '2000-01-01 09:05', '2000-01-01 09:08'], freq='T') @@ -1781,20 +2067,19 @@ def test_add_iadd(self): expected6), (rng7, other7, expected7)]: - # GH9094 - with tm.assert_produces_warning(FutureWarning): - result_add = rng + other - result_union = rng.union(other) - - tm.assert_index_equal(result_add, expected) tm.assert_index_equal(result_union, expected) - # GH 6527 - # GH9094 - with tm.assert_produces_warning(FutureWarning): - rng += other - tm.assert_index_equal(rng, expected) + def test_add_iadd(self): + rng = pd.period_range('1/1/2000', freq='D', periods=5) + other = pd.period_range('1/6/2000', freq='D', periods=5) + + # previously performed setop union, now raises TypeError (GH14164) + with tm.assertRaises(TypeError): + rng + other + + with tm.assertRaises(TypeError): + rng += other # offset # DateOffset @@ -1881,7 +2166,7 @@ def test_add_iadd(self): rng += 1 tm.assert_index_equal(rng, expected) - def test_sub_isub(self): + def test_difference(self): # diff rng1 = pd.period_range('1/1/2000', freq='D', periods=5) other1 = pd.period_range('1/6/2000', freq='D', periods=5) @@ -1923,6 +2208,19 @@ def test_sub_isub(self): result_union = rng.difference(other) tm.assert_index_equal(result_union, expected) + def test_sub_isub(self): + + # previously performed setop, now raises TypeError (GH14164) + # TODO needs to wait on #13077 for decision on result type + rng = pd.period_range('1/1/2000', freq='D', periods=5) + other = pd.period_range('1/6/2000', freq='D', periods=5) + + with tm.assertRaises(TypeError): + rng - other + + with tm.assertRaises(TypeError): + rng -= other + # offset # DateOffset rng = pd.period_range('2014', '2024', freq='A') @@ -2007,12 +2305,38 @@ def test_sub_isub(self): rng -= 1 tm.assert_index_equal(rng, expected) + def test_comp_nat(self): + left = pd.PeriodIndex([pd.Period('2011-01-01'), pd.NaT, + pd.Period('2011-01-03')]) + right = pd.PeriodIndex([pd.NaT, pd.NaT, pd.Period('2011-01-03')]) + + for l, r in [(left, right), (left.asobject, right.asobject)]: + result = l == r + expected = np.array([False, False, True]) + tm.assert_numpy_array_equal(result, expected) + + result = l != r + expected = np.array([True, True, False]) + tm.assert_numpy_array_equal(result, expected) + + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(l == pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT == r, expected) + + expected = np.array([True, True, True]) + tm.assert_numpy_array_equal(l != pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT != l, expected) + + expected = np.array([False, False, False]) + tm.assert_numpy_array_equal(l < pd.NaT, expected) + tm.assert_numpy_array_equal(pd.NaT > l, expected) + def test_value_counts_unique(self): # GH 7735 idx = pd.period_range('2011-01-01 09:00', freq='H', periods=10) # create repeated values, 'n'th element is repeated by n+1 times - idx = PeriodIndex( - np.repeat(idx.values, range(1, len(idx) + 1)), freq='H') + idx = PeriodIndex(np.repeat(idx.values, range(1, len(idx) + 1)), + freq='H') exp_idx = PeriodIndex(['2011-01-01 18:00', '2011-01-01 17:00', '2011-01-01 16:00', '2011-01-01 15:00', @@ -2021,24 +2345,31 @@ def test_value_counts_unique(self): '2011-01-01 10:00', '2011-01-01 09:00'], freq='H') expected = Series(range(10, 0, -1), index=exp_idx, dtype='int64') - tm.assert_series_equal(idx.value_counts(), expected) - 
expected = pd.period_range('2011-01-01 09:00', freq='H', periods=10) + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(), expected) + + expected = pd.period_range('2011-01-01 09:00', freq='H', + periods=10) tm.assert_index_equal(idx.unique(), expected) idx = PeriodIndex(['2013-01-01 09:00', '2013-01-01 09:00', '2013-01-01 09:00', '2013-01-01 08:00', '2013-01-01 08:00', pd.NaT], freq='H') - exp_idx = PeriodIndex( - ['2013-01-01 09:00', '2013-01-01 08:00'], freq='H') + exp_idx = PeriodIndex(['2013-01-01 09:00', '2013-01-01 08:00'], + freq='H') expected = Series([3, 2], index=exp_idx) - tm.assert_series_equal(idx.value_counts(), expected) - exp_idx = PeriodIndex( - ['2013-01-01 09:00', '2013-01-01 08:00', pd.NaT], freq='H') + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(), expected) + + exp_idx = PeriodIndex(['2013-01-01 09:00', '2013-01-01 08:00', + pd.NaT], freq='H') expected = Series([3, 2, 1], index=exp_idx) - tm.assert_series_equal(idx.value_counts(dropna=False), expected) + + for obj in [idx, Series(idx)]: + tm.assert_series_equal(obj.value_counts(dropna=False), expected) tm.assert_index_equal(idx.unique(), exp_idx) @@ -2054,6 +2385,28 @@ def test_drop_duplicates_metadata(self): self.assert_index_equal(idx, result) self.assertEqual(idx.freq, result.freq) + def test_drop_duplicates(self): + # to check Index/Series compat + base = pd.period_range('2011-01-01', '2011-01-31', freq='D', + name='idx') + idx = base.append(base[:5]) + + res = idx.drop_duplicates() + tm.assert_index_equal(res, base) + res = Series(idx).drop_duplicates() + tm.assert_series_equal(res, Series(base)) + + res = idx.drop_duplicates(keep='last') + exp = base[5:].append(base[:5]) + tm.assert_index_equal(res, exp) + res = Series(idx).drop_duplicates(keep='last') + tm.assert_series_equal(res, Series(exp, index=np.arange(5, 36))) + + res = idx.drop_duplicates(keep=False) + tm.assert_index_equal(res, base[5:]) + res = Series(idx).drop_duplicates(keep=False) + tm.assert_series_equal(res, Series(base[5:], index=np.arange(5, 31))) + def test_order_compat(self): def _check_freq(index, expected_index): if isinstance(index, PeriodIndex): @@ -2074,14 +2427,16 @@ def _check_freq(index, expected_index): ordered, indexer = idx.sort_values(return_indexer=True) self.assert_index_equal(ordered, idx) self.assert_numpy_array_equal(indexer, - np.array([0, 1, 2], dtype=np.int64)) + np.array([0, 1, 2]), + check_dtype=False) _check_freq(ordered, idx) ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) self.assert_index_equal(ordered, idx[::-1]) self.assert_numpy_array_equal(indexer, - np.array([2, 1, 0], dtype=np.int64)) + np.array([2, 1, 0]), + check_dtype=False) _check_freq(ordered, idx[::-1]) pidx = PeriodIndex(['2011', '2013', '2015', '2012', @@ -2103,24 +2458,25 @@ def _check_freq(index, expected_index): ordered, indexer = idx.sort_values(return_indexer=True) self.assert_index_equal(ordered, expected) - exp = np.array([0, 4, 3, 1, 2], dtype=np.int64) - self.assert_numpy_array_equal(indexer, exp) + exp = np.array([0, 4, 3, 1, 2]) + self.assert_numpy_array_equal(indexer, exp, check_dtype=False) _check_freq(ordered, idx) ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) self.assert_index_equal(ordered, expected[::-1]) - exp = np.array([2, 1, 3, 4, 0], dtype=np.int64) - self.assert_numpy_array_equal(indexer, exp) + exp = np.array([2, 1, 3, 4, 0]) + self.assert_numpy_array_equal(indexer, exp, + check_dtype=False) _check_freq(ordered, idx) pidx = 
PeriodIndex(['2011', '2013', 'NaT', '2011'], name='pidx', freq='D') result = pidx.sort_values() - expected = PeriodIndex( - ['NaT', '2011', '2011', '2013'], name='pidx', freq='D') + expected = PeriodIndex(['NaT', '2011', '2011', '2013'], + name='pidx', freq='D') self.assert_index_equal(result, expected) self.assertEqual(result.freq, 'D') @@ -2148,7 +2504,8 @@ def test_order(self): ordered, indexer = idx.sort_values(return_indexer=True) self.assert_index_equal(ordered, idx) self.assert_numpy_array_equal(indexer, - np.array([0, 1, 2], dtype=np.int64)) + np.array([0, 1, 2]), + check_dtype=False) self.assertEqual(ordered.freq, idx.freq) self.assertEqual(ordered.freq, freq) @@ -2157,7 +2514,8 @@ def test_order(self): expected = idx[::-1] self.assert_index_equal(ordered, expected) self.assert_numpy_array_equal(indexer, - np.array([2, 1, 0], dtype=np.int64)) + np.array([2, 1, 0]), + check_dtype=False) self.assertEqual(ordered.freq, expected.freq) self.assertEqual(ordered.freq, freq) @@ -2166,20 +2524,19 @@ def test_order(self): exp1 = PeriodIndex(['2011-01-01', '2011-01-01', '2011-01-02', '2011-01-03', '2011-01-05'], freq='D', name='idx1') - # TODO(wesm): unused? - # idx2 = PeriodIndex(['2011-01-01', '2011-01-03', '2011-01-05', - # '2011-01-02', '2011-01-01'], - # freq='D', name='idx2') - # exp2 = PeriodIndex(['2011-01-01', '2011-01-01', '2011-01-02', - # '2011-01-03', '2011-01-05'], - # freq='D', name='idx2') - - # idx3 = PeriodIndex([pd.NaT, '2011-01-03', '2011-01-05', - # '2011-01-02', pd.NaT], freq='D', name='idx3') - # exp3 = PeriodIndex([pd.NaT, pd.NaT, '2011-01-02', '2011-01-03', - # '2011-01-05'], freq='D', name='idx3') + idx2 = PeriodIndex(['2011-01-01', '2011-01-03', '2011-01-05', + '2011-01-02', '2011-01-01'], + freq='D', name='idx2') + exp2 = PeriodIndex(['2011-01-01', '2011-01-01', '2011-01-02', + '2011-01-03', '2011-01-05'], + freq='D', name='idx2') - for idx, expected in [(idx1, exp1), (idx1, exp1), (idx1, exp1)]: + idx3 = PeriodIndex([pd.NaT, '2011-01-03', '2011-01-05', + '2011-01-02', pd.NaT], freq='D', name='idx3') + exp3 = PeriodIndex([pd.NaT, pd.NaT, '2011-01-02', '2011-01-03', + '2011-01-05'], freq='D', name='idx3') + + for idx, expected in [(idx1, exp1), (idx2, exp2), (idx3, exp3)]: ordered = idx.sort_values() self.assert_index_equal(ordered, expected) self.assertEqual(ordered.freq, 'D') @@ -2191,16 +2548,16 @@ def test_order(self): ordered, indexer = idx.sort_values(return_indexer=True) self.assert_index_equal(ordered, expected) - exp = np.array([0, 4, 3, 1, 2], dtype=np.int64) - self.assert_numpy_array_equal(indexer, exp) + exp = np.array([0, 4, 3, 1, 2]) + self.assert_numpy_array_equal(indexer, exp, check_dtype=False) self.assertEqual(ordered.freq, 'D') ordered, indexer = idx.sort_values(return_indexer=True, ascending=False) self.assert_index_equal(ordered, expected[::-1]) - exp = np.array([2, 1, 3, 4, 0], dtype=np.int64) - self.assert_numpy_array_equal(indexer, exp) + exp = np.array([2, 1, 3, 4, 0]) + self.assert_numpy_array_equal(indexer, exp, check_dtype=False) self.assertEqual(ordered.freq, 'D') def test_getitem(self): @@ -2327,6 +2684,78 @@ def test_shift(self): '2011-01-01 09:00'], name='xxx', freq='H') tm.assert_index_equal(idx.shift(-3), exp) + def test_repeat(self): + index = pd.period_range('2001-01-01', periods=2, freq='D') + exp = pd.PeriodIndex(['2001-01-01', '2001-01-01', + '2001-01-02', '2001-01-02'], freq='D') + for res in [index.repeat(2), np.repeat(index, 2)]: + tm.assert_index_equal(res, exp) + + index = pd.period_range('2001-01-01', periods=2, 
freq='2D') + exp = pd.PeriodIndex(['2001-01-01', '2001-01-01', + '2001-01-03', '2001-01-03'], freq='2D') + for res in [index.repeat(2), np.repeat(index, 2)]: + tm.assert_index_equal(res, exp) + + index = pd.PeriodIndex(['2001-01', 'NaT', '2003-01'], freq='M') + exp = pd.PeriodIndex(['2001-01', '2001-01', '2001-01', + 'NaT', 'NaT', 'NaT', + '2003-01', '2003-01', '2003-01'], freq='M') + for res in [index.repeat(3), np.repeat(index, 3)]: + tm.assert_index_equal(res, exp) + + def test_nat(self): + self.assertIs(pd.PeriodIndex._na_value, pd.NaT) + self.assertIs(pd.PeriodIndex([], freq='M')._na_value, pd.NaT) + + idx = pd.PeriodIndex(['2011-01-01', '2011-01-02'], freq='D') + self.assertTrue(idx._can_hold_na) + + tm.assert_numpy_array_equal(idx._isnan, np.array([False, False])) + self.assertFalse(idx.hasnans) + tm.assert_numpy_array_equal(idx._nan_idxs, + np.array([], dtype=np.int64)) + + idx = pd.PeriodIndex(['2011-01-01', 'NaT'], freq='D') + self.assertTrue(idx._can_hold_na) + + tm.assert_numpy_array_equal(idx._isnan, np.array([False, True])) + self.assertTrue(idx.hasnans) + tm.assert_numpy_array_equal(idx._nan_idxs, + np.array([1], dtype=np.int64)) + + def test_equals(self): + # GH 13107 + for freq in ['D', 'M']: + idx = pd.PeriodIndex(['2011-01-01', '2011-01-02', 'NaT'], + freq=freq) + self.assertTrue(idx.equals(idx)) + self.assertTrue(idx.equals(idx.copy())) + self.assertTrue(idx.equals(idx.asobject)) + self.assertTrue(idx.asobject.equals(idx)) + self.assertTrue(idx.asobject.equals(idx.asobject)) + self.assertFalse(idx.equals(list(idx))) + self.assertFalse(idx.equals(pd.Series(idx))) + + idx2 = pd.PeriodIndex(['2011-01-01', '2011-01-02', 'NaT'], + freq='H') + self.assertFalse(idx.equals(idx2)) + self.assertFalse(idx.equals(idx2.copy())) + self.assertFalse(idx.equals(idx2.asobject)) + self.assertFalse(idx.asobject.equals(idx2)) + self.assertFalse(idx.equals(list(idx2))) + self.assertFalse(idx.equals(pd.Series(idx2))) + + # same internal, different tz + idx3 = pd.PeriodIndex._simple_new(idx.asi8, freq='H') + tm.assert_numpy_array_equal(idx.asi8, idx3.asi8) + self.assertFalse(idx.equals(idx3)) + self.assertFalse(idx.equals(idx3.copy())) + self.assertFalse(idx.equals(idx3.asobject)) + self.assertFalse(idx.asobject.equals(idx3)) + self.assertFalse(idx.equals(list(idx3))) + self.assertFalse(idx.equals(pd.Series(idx3))) + if __name__ == '__main__': import nose diff --git a/pandas/tseries/tests/test_bin_groupby.py b/pandas/tseries/tests/test_bin_groupby.py index 6b6c468b7c391..08c0833be0cd6 100644 --- a/pandas/tseries/tests/test_bin_groupby.py +++ b/pandas/tseries/tests/test_bin_groupby.py @@ -3,12 +3,12 @@ from numpy import nan import numpy as np +from pandas.types.common import _ensure_int64 from pandas import Index, isnull from pandas.util.testing import assert_almost_equal import pandas.util.testing as tm import pandas.lib as lib import pandas.algos as algos -from pandas.core import common as com def test_series_grouper(): @@ -90,8 +90,8 @@ def _check(dtype): bins = np.array([6, 12, 20]) out = np.zeros((3, 4), dtype) counts = np.zeros(len(out), dtype=np.int64) - labels = com._ensure_int64(np.repeat(np.arange(3), - np.diff(np.r_[0, bins]))) + labels = _ensure_int64(np.repeat(np.arange(3), + np.diff(np.r_[0, bins]))) func = getattr(algos, 'group_ohlc_%s' % dtype) func(out, counts, obj[:, None], labels) diff --git a/pandas/tseries/tests/test_converter.py b/pandas/tseries/tests/test_converter.py index ceb8660efb9cd..37d9c35639c32 100644 --- a/pandas/tseries/tests/test_converter.py +++ 
b/pandas/tseries/tests/test_converter.py @@ -77,6 +77,24 @@ def test_conversion_float(self): rs = self.dtc.convert(datetime(2012, 1, 1, 1, 2, 3), None, None) tm.assert_almost_equal(rs, xp, decimals) + def test_conversion_outofbounds_datetime(self): + # 2579 + values = [date(1677, 1, 1), date(1677, 1, 2)] + rs = self.dtc.convert(values, None, None) + xp = converter.dates.date2num(values) + tm.assert_numpy_array_equal(rs, xp) + rs = self.dtc.convert(values[0], None, None) + xp = converter.dates.date2num(values[0]) + self.assertEqual(rs, xp) + + values = [datetime(1677, 1, 1, 12), datetime(1677, 1, 2, 12)] + rs = self.dtc.convert(values, None, None) + xp = converter.dates.date2num(values) + tm.assert_numpy_array_equal(rs, xp) + rs = self.dtc.convert(values[0], None, None) + xp = converter.dates.date2num(values[0]) + self.assertEqual(rs, xp) + def test_time_formatter(self): self.tc(90000) diff --git a/pandas/tseries/tests/test_daterange.py b/pandas/tseries/tests/test_daterange.py index 6ad33b6b973de..87f9f55e0189c 100644 --- a/pandas/tseries/tests/test_daterange.py +++ b/pandas/tseries/tests/test_daterange.py @@ -7,11 +7,11 @@ from pandas.tseries.index import DatetimeIndex from pandas import Timestamp -from pandas.tseries.offsets import generate_range +from pandas.tseries.offsets import (BDay, BMonthEnd, CDay, MonthEnd, + generate_range, DateOffset, Minute) from pandas.tseries.index import cdate_range, bdate_range, date_range from pandas.core import common as com -import pandas.core.datetools as datetools from pandas.util.testing import assertRaisesRegexp import pandas.util.testing as tm @@ -27,12 +27,12 @@ def eq_gen_range(kwargs, expected): class TestGenRangeGeneration(tm.TestCase): def test_generate(self): - rng1 = list(generate_range(START, END, offset=datetools.bday)) + rng1 = list(generate_range(START, END, offset=BDay())) rng2 = list(generate_range(START, END, time_rule='B')) self.assertEqual(rng1, rng2) def test_generate_cday(self): - rng1 = list(generate_range(START, END, offset=datetools.cday)) + rng1 = list(generate_range(START, END, offset=CDay())) rng2 = list(generate_range(START, END, time_rule='C')) self.assertEqual(rng1, rng2) @@ -78,44 +78,42 @@ def setUp(self): self.rng = bdate_range(START, END) def test_constructor(self): - bdate_range(START, END, freq=datetools.bday) - bdate_range(START, periods=20, freq=datetools.bday) - bdate_range(end=START, periods=20, freq=datetools.bday) + bdate_range(START, END, freq=BDay()) + bdate_range(START, periods=20, freq=BDay()) + bdate_range(end=START, periods=20, freq=BDay()) self.assertRaises(ValueError, date_range, '2011-1-1', '2012-1-1', 'B') self.assertRaises(ValueError, bdate_range, '2011-1-1', '2012-1-1', 'B') def test_naive_aware_conflicts(self): - naive = bdate_range(START, END, freq=datetools.bday, tz=None) - aware = bdate_range(START, END, freq=datetools.bday, + naive = bdate_range(START, END, freq=BDay(), tz=None) + aware = bdate_range(START, END, freq=BDay(), tz="Asia/Hong_Kong") assertRaisesRegexp(TypeError, "tz-naive.*tz-aware", naive.join, aware) assertRaisesRegexp(TypeError, "tz-naive.*tz-aware", aware.join, naive) def test_cached_range(self): - DatetimeIndex._cached_range(START, END, offset=datetools.bday) - DatetimeIndex._cached_range(START, periods=20, - offset=datetools.bday) - DatetimeIndex._cached_range(end=START, periods=20, - offset=datetools.bday) + DatetimeIndex._cached_range(START, END, offset=BDay()) + DatetimeIndex._cached_range(START, periods=20, offset=BDay()) + DatetimeIndex._cached_range(end=START, 
periods=20, offset=BDay()) assertRaisesRegexp(TypeError, "offset", DatetimeIndex._cached_range, START, END) assertRaisesRegexp(TypeError, "specify period", DatetimeIndex._cached_range, START, - offset=datetools.bday) + offset=BDay()) assertRaisesRegexp(TypeError, "specify period", DatetimeIndex._cached_range, end=END, - offset=datetools.bday) + offset=BDay()) assertRaisesRegexp(TypeError, "start or end", DatetimeIndex._cached_range, periods=20, - offset=datetools.bday) + offset=BDay()) def test_cached_range_bug(self): rng = date_range('2010-09-01 05:00:00', periods=50, - freq=datetools.DateOffset(hours=6)) + freq=DateOffset(hours=6)) self.assertEqual(len(rng), 50) self.assertEqual(rng[0], datetime(2010, 9, 1, 5)) @@ -155,7 +153,7 @@ def test_getitem(self): self.assertEqual(smaller.offset, self.rng.offset) sliced = self.rng[::5] - self.assertEqual(sliced.offset, datetools.bday * 5) + self.assertEqual(sliced.offset, BDay() * 5) fancy_indexed = self.rng[[4, 3, 2, 1, 0]] self.assertEqual(len(fancy_indexed), 5) @@ -183,9 +181,9 @@ def test_shift(self): self.assertEqual(shifted[0], self.rng[0]) self.assertEqual(shifted.offset, self.rng.offset) - rng = date_range(START, END, freq=datetools.bmonthEnd) - shifted = rng.shift(1, freq=datetools.bday) - self.assertEqual(shifted[0], rng[0] + datetools.bday) + rng = date_range(START, END, freq=BMonthEnd()) + shifted = rng.shift(1, freq=BDay()) + self.assertEqual(shifted[0], rng[0] + BDay()) def test_pickle_unpickle(self): unpickled = self.round_trip_pickle(self.rng) @@ -217,7 +215,7 @@ def test_union(self): tm.assert_index_equal(right.union(left), the_union) # overlapping, but different offset - rng = date_range(START, END, freq=datetools.bmonthEnd) + rng = date_range(START, END, freq=BMonthEnd()) the_union = self.rng.union(rng) tm.assertIsInstance(the_union, DatetimeIndex) @@ -248,14 +246,14 @@ def test_outer_join(self): tm.assertIsInstance(the_join, DatetimeIndex) # overlapping, but different offset - rng = date_range(START, END, freq=datetools.bmonthEnd) + rng = date_range(START, END, freq=BMonthEnd()) the_join = self.rng.join(rng, how='outer') tm.assertIsInstance(the_join, DatetimeIndex) self.assertIsNone(the_join.freq) def test_union_not_cacheable(self): - rng = date_range('1/1/2000', periods=50, freq=datetools.Minute()) + rng = date_range('1/1/2000', periods=50, freq=Minute()) rng1 = rng[10:] rng2 = rng[:25] the_union = rng1.union(rng2) @@ -268,7 +266,7 @@ def test_union_not_cacheable(self): self.assert_index_equal(the_union, expected) def test_intersection(self): - rng = date_range('1/1/2000', periods=50, freq=datetools.Minute()) + rng = date_range('1/1/2000', periods=50, freq=Minute()) rng1 = rng[10:] rng2 = rng[:25] the_int = rng1.intersection(rng2) @@ -309,7 +307,7 @@ def test_summary_dateutil(self): def test_misc(self): end = datetime(2009, 5, 13) dr = bdate_range(end=end, periods=20) - firstDate = end - 19 * datetools.bday + firstDate = end - 19 * BDay() assert len(dr) == 20 assert dr[0] == firstDate @@ -351,18 +349,18 @@ def test_daterange_bug_456(self): # GH #456 rng1 = bdate_range('12/5/2011', '12/5/2011') rng2 = bdate_range('12/2/2011', '12/5/2011') - rng2.offset = datetools.BDay() + rng2.offset = BDay() result = rng1.union(rng2) tm.assertIsInstance(result, DatetimeIndex) def test_error_with_zero_monthends(self): self.assertRaises(ValueError, date_range, '1/1/2000', '1/1/2001', - freq=datetools.MonthEnd(0)) + freq=MonthEnd(0)) def test_range_bug(self): # GH #770 - offset = datetools.DateOffset(months=3) + offset = DateOffset(months=3) 
result = date_range("2011-1-1", "2012-1-31", freq=offset) start = datetime(2011, 1, 1) @@ -456,9 +454,9 @@ def test_month_range_union_tz_pytz(self): late_end = datetime(2011, 5, 1) early_dr = date_range(start=early_start, end=early_end, tz=tz, - freq=datetools.monthEnd) + freq=MonthEnd()) late_dr = date_range(start=late_start, end=late_end, tz=tz, - freq=datetools.monthEnd) + freq=MonthEnd()) early_dr.union(late_dr) @@ -475,9 +473,9 @@ def test_month_range_union_tz_dateutil(self): late_end = datetime(2011, 5, 1) early_dr = date_range(start=early_start, end=early_end, tz=tz, - freq=datetools.monthEnd) + freq=MonthEnd()) late_dr = date_range(start=late_start, end=late_end, tz=tz, - freq=datetools.monthEnd) + freq=MonthEnd()) early_dr.union(late_dr) @@ -485,7 +483,7 @@ def test_range_closed(self): begin = datetime(2011, 1, 1) end = datetime(2014, 1, 1) - for freq in ["3D", "2M", "7W", "3H", "A"]: + for freq in ["1D", "3D", "2M", "7W", "3H", "A"]: closed = date_range(begin, end, closed=None, freq=freq) left = date_range(begin, end, closed="left", freq=freq) right = date_range(begin, end, closed="right", freq=freq) @@ -501,11 +499,11 @@ def test_range_closed(self): self.assert_index_equal(expected_right, right) def test_range_closed_with_tz_aware_start_end(self): - # GH12409 + # GH12409, GH12684 begin = Timestamp('2011/1/1', tz='US/Eastern') end = Timestamp('2014/1/1', tz='US/Eastern') - for freq in ["3D", "2M", "7W", "3H", "A"]: + for freq in ["1D", "3D", "2M", "7W", "3H", "A"]: closed = date_range(begin, end, closed=None, freq=freq) left = date_range(begin, end, closed="left", freq=freq) right = date_range(begin, end, closed="right", freq=freq) @@ -520,15 +518,28 @@ def test_range_closed_with_tz_aware_start_end(self): self.assert_index_equal(expected_left, left) self.assert_index_equal(expected_right, right) - # test with default frequency, UTC - begin = Timestamp('2011/1/1', tz='UTC') - end = Timestamp('2014/1/1', tz='UTC') + begin = Timestamp('2011/1/1') + end = Timestamp('2014/1/1') + begintz = Timestamp('2011/1/1', tz='US/Eastern') + endtz = Timestamp('2014/1/1', tz='US/Eastern') + + for freq in ["1D", "3D", "2M", "7W", "3H", "A"]: + closed = date_range(begin, end, closed=None, freq=freq, + tz='US/Eastern') + left = date_range(begin, end, closed="left", freq=freq, + tz='US/Eastern') + right = date_range(begin, end, closed="right", freq=freq, + tz='US/Eastern') + expected_left = left + expected_right = right - intervals = ['left', 'right', None] - for i in intervals: - result = date_range(start=begin, end=end, closed=i) - self.assertEqual(result[0], begin) - self.assertEqual(result[-1], end) + if endtz == closed[-1]: + expected_left = closed[:-1] + if begintz == closed[0]: + expected_right = closed[1:] + + self.assert_index_equal(expected_left, left) + self.assert_index_equal(expected_right, right) def test_range_closed_boundary(self): # GH 11804 @@ -582,29 +593,29 @@ def setUp(self): self.rng = cdate_range(START, END) def test_constructor(self): - cdate_range(START, END, freq=datetools.cday) - cdate_range(START, periods=20, freq=datetools.cday) - cdate_range(end=START, periods=20, freq=datetools.cday) + cdate_range(START, END, freq=CDay()) + cdate_range(START, periods=20, freq=CDay()) + cdate_range(end=START, periods=20, freq=CDay()) self.assertRaises(ValueError, date_range, '2011-1-1', '2012-1-1', 'C') self.assertRaises(ValueError, cdate_range, '2011-1-1', '2012-1-1', 'C') def test_cached_range(self): - DatetimeIndex._cached_range(START, END, offset=datetools.cday) + 
DatetimeIndex._cached_range(START, END, offset=CDay()) DatetimeIndex._cached_range(START, periods=20, - offset=datetools.cday) + offset=CDay()) DatetimeIndex._cached_range(end=START, periods=20, - offset=datetools.cday) + offset=CDay()) self.assertRaises(Exception, DatetimeIndex._cached_range, START, END) self.assertRaises(Exception, DatetimeIndex._cached_range, START, - freq=datetools.cday) + freq=CDay()) self.assertRaises(Exception, DatetimeIndex._cached_range, end=END, - freq=datetools.cday) + freq=CDay()) self.assertRaises(Exception, DatetimeIndex._cached_range, periods=20, - freq=datetools.cday) + freq=CDay()) def test_comparison(self): d = self.rng[10] @@ -629,7 +640,7 @@ def test_getitem(self): self.assertEqual(smaller.offset, self.rng.offset) sliced = self.rng[::5] - self.assertEqual(sliced.offset, datetools.cday * 5) + self.assertEqual(sliced.offset, CDay() * 5) fancy_indexed = self.rng[[4, 3, 2, 1, 0]] self.assertEqual(len(fancy_indexed), 5) @@ -659,9 +670,9 @@ def test_shift(self): self.assertEqual(shifted.offset, self.rng.offset) with tm.assert_produces_warning(com.PerformanceWarning): - rng = date_range(START, END, freq=datetools.bmonthEnd) - shifted = rng.shift(1, freq=datetools.cday) - self.assertEqual(shifted[0], rng[0] + datetools.cday) + rng = date_range(START, END, freq=BMonthEnd()) + shifted = rng.shift(1, freq=CDay()) + self.assertEqual(shifted[0], rng[0] + CDay()) def test_pickle_unpickle(self): unpickled = self.round_trip_pickle(self.rng) @@ -693,7 +704,7 @@ def test_union(self): self.assert_index_equal(right.union(left), the_union) # overlapping, but different offset - rng = date_range(START, END, freq=datetools.bmonthEnd) + rng = date_range(START, END, freq=BMonthEnd()) the_union = self.rng.union(rng) tm.assertIsInstance(the_union, DatetimeIndex) @@ -724,7 +735,7 @@ def test_outer_join(self): tm.assertIsInstance(the_join, DatetimeIndex) # overlapping, but different offset - rng = date_range(START, END, freq=datetools.bmonthEnd) + rng = date_range(START, END, freq=BMonthEnd()) the_join = self.rng.join(rng, how='outer') tm.assertIsInstance(the_join, DatetimeIndex) @@ -754,7 +765,7 @@ def test_summary_dateutil(self): def test_misc(self): end = datetime(2009, 5, 13) dr = cdate_range(end=end, periods=20) - firstDate = end - 19 * datetools.cday + firstDate = end - 19 * CDay() assert len(dr) == 20 assert dr[0] == firstDate @@ -779,7 +790,7 @@ def test_daterange_bug_456(self): # GH #456 rng1 = cdate_range('12/5/2011', '12/5/2011') rng2 = cdate_range('12/2/2011', '12/5/2011') - rng2.offset = datetools.CDay() + rng2.offset = CDay() result = rng1.union(rng2) tm.assertIsInstance(result, DatetimeIndex) diff --git a/pandas/tseries/tests/test_frequencies.py b/pandas/tseries/tests/test_frequencies.py index 528b9cc0b08a9..5ba98f15aed8d 100644 --- a/pandas/tseries/tests/test_frequencies.py +++ b/pandas/tseries/tests/test_frequencies.py @@ -18,124 +18,210 @@ from pandas import Timedelta -def test_to_offset_multiple(): - freqstr = '2h30min' - freqstr2 = '2h 30min' - - result = frequencies.to_offset(freqstr) - assert (result == frequencies.to_offset(freqstr2)) - expected = offsets.Minute(150) - assert (result == expected) - - freqstr = '2h30min15s' - result = frequencies.to_offset(freqstr) - expected = offsets.Second(150 * 60 + 15) - assert (result == expected) - - freqstr = '2h 60min' - result = frequencies.to_offset(freqstr) - expected = offsets.Hour(3) - assert (result == expected) - - freqstr = '15l500u' - result = frequencies.to_offset(freqstr) - expected = offsets.Micro(15500) - 
assert (result == expected) - - freqstr = '10s75L' - result = frequencies.to_offset(freqstr) - expected = offsets.Milli(10075) - assert (result == expected) - - freqstr = '2800N' - result = frequencies.to_offset(freqstr) - expected = offsets.Nano(2800) - assert (result == expected) - - # malformed - try: - frequencies.to_offset('2h20m') - except ValueError: - pass - else: - assert (False) - - -def test_to_offset_negative(): - freqstr = '-1S' - result = frequencies.to_offset(freqstr) - assert (result.n == -1) - - freqstr = '-5min10s' - result = frequencies.to_offset(freqstr) - assert (result.n == -310) - - -def test_to_offset_leading_zero(): - freqstr = '00H 00T 01S' - result = frequencies.to_offset(freqstr) - assert (result.n == 1) - - freqstr = '-00H 03T 14S' - result = frequencies.to_offset(freqstr) - assert (result.n == -194) - - -def test_to_offset_pd_timedelta(): - # Tests for #9064 - td = Timedelta(days=1, seconds=1) - result = frequencies.to_offset(td) - expected = offsets.Second(86401) - assert (expected == result) - - td = Timedelta(days=-1, seconds=1) - result = frequencies.to_offset(td) - expected = offsets.Second(-86399) - assert (expected == result) - - td = Timedelta(hours=1, minutes=10) - result = frequencies.to_offset(td) - expected = offsets.Minute(70) - assert (expected == result) - - td = Timedelta(hours=1, minutes=-10) - result = frequencies.to_offset(td) - expected = offsets.Minute(50) - assert (expected == result) - - td = Timedelta(weeks=1) - result = frequencies.to_offset(td) - expected = offsets.Day(7) - assert (expected == result) - - td1 = Timedelta(hours=1) - result1 = frequencies.to_offset(td1) - result2 = frequencies.to_offset('60min') - assert (result1 == result2) - - td = Timedelta(microseconds=1) - result = frequencies.to_offset(td) - expected = offsets.Micro(1) - assert (expected == result) - - td = Timedelta(microseconds=0) - tm.assertRaises(ValueError, lambda: frequencies.to_offset(td)) - - -def test_anchored_shortcuts(): - result = frequencies.to_offset('W') - expected = frequencies.to_offset('W-SUN') - assert (result == expected) - - result1 = frequencies.to_offset('Q') - result2 = frequencies.to_offset('Q-DEC') - expected = offsets.QuarterEnd(startingMonth=12) - assert (result1 == expected) - assert (result2 == expected) - - result1 = frequencies.to_offset('Q-MAY') - expected = offsets.QuarterEnd(startingMonth=5) - assert (result1 == expected) +class TestToOffset(tm.TestCase): + + def test_to_offset_multiple(self): + freqstr = '2h30min' + freqstr2 = '2h 30min' + + result = frequencies.to_offset(freqstr) + assert (result == frequencies.to_offset(freqstr2)) + expected = offsets.Minute(150) + assert (result == expected) + + freqstr = '2h30min15s' + result = frequencies.to_offset(freqstr) + expected = offsets.Second(150 * 60 + 15) + assert (result == expected) + + freqstr = '2h 60min' + result = frequencies.to_offset(freqstr) + expected = offsets.Hour(3) + assert (result == expected) + + freqstr = '15l500u' + result = frequencies.to_offset(freqstr) + expected = offsets.Micro(15500) + assert (result == expected) + + freqstr = '10s75L' + result = frequencies.to_offset(freqstr) + expected = offsets.Milli(10075) + assert (result == expected) + + freqstr = '2800N' + result = frequencies.to_offset(freqstr) + expected = offsets.Nano(2800) + assert (result == expected) + + freqstr = '2SM' + result = frequencies.to_offset(freqstr) + expected = offsets.SemiMonthEnd(2) + assert (result == expected) + + freqstr = '2SM-16' + result = frequencies.to_offset(freqstr) + 
expected = offsets.SemiMonthEnd(2, day_of_month=16) + assert (result == expected) + + freqstr = '2SMS-14' + result = frequencies.to_offset(freqstr) + expected = offsets.SemiMonthBegin(2, day_of_month=14) + assert (result == expected) + + freqstr = '2SMS-15' + result = frequencies.to_offset(freqstr) + expected = offsets.SemiMonthBegin(2) + assert (result == expected) + + # malformed + with tm.assertRaisesRegexp(ValueError, 'Invalid frequency: 2h20m'): + frequencies.to_offset('2h20m') + + def test_to_offset_negative(self): + freqstr = '-1S' + result = frequencies.to_offset(freqstr) + assert (result.n == -1) + + freqstr = '-5min10s' + result = frequencies.to_offset(freqstr) + assert (result.n == -310) + + freqstr = '-2SM' + result = frequencies.to_offset(freqstr) + assert (result.n == -2) + + freqstr = '-1SMS' + result = frequencies.to_offset(freqstr) + assert (result.n == -1) + + def test_to_offset_invalid(self): + # GH 13930 + with tm.assertRaisesRegexp(ValueError, 'Invalid frequency: U1'): + frequencies.to_offset('U1') + with tm.assertRaisesRegexp(ValueError, 'Invalid frequency: -U'): + frequencies.to_offset('-U') + with tm.assertRaisesRegexp(ValueError, 'Invalid frequency: 3U1'): + frequencies.to_offset('3U1') + with tm.assertRaisesRegexp(ValueError, 'Invalid frequency: -2-3U'): + frequencies.to_offset('-2-3U') + with tm.assertRaisesRegexp(ValueError, 'Invalid frequency: -2D:3H'): + frequencies.to_offset('-2D:3H') + + # ToDo: Must be fixed in #8419 + with tm.assertRaisesRegexp(ValueError, 'Invalid frequency: .5S'): + frequencies.to_offset('.5S') + + # split offsets with spaces are valid + assert frequencies.to_offset('2D 3H') == offsets.Hour(51) + assert frequencies.to_offset('2 D3 H') == offsets.Hour(51) + assert frequencies.to_offset('2 D 3 H') == offsets.Hour(51) + assert frequencies.to_offset(' 2 D 3 H ') == offsets.Hour(51) + assert frequencies.to_offset(' H ') == offsets.Hour() + assert frequencies.to_offset(' 3 H ') == offsets.Hour(3) + + # special cases + assert frequencies.to_offset('2SMS-15') == offsets.SemiMonthBegin(2) + with tm.assertRaisesRegexp(ValueError, + 'Invalid frequency: 2SMS-15-15'): + frequencies.to_offset('2SMS-15-15') + with tm.assertRaisesRegexp(ValueError, 'Invalid frequency: 2SMS-15D'): + frequencies.to_offset('2SMS-15D') + + def test_to_offset_leading_zero(self): + freqstr = '00H 00T 01S' + result = frequencies.to_offset(freqstr) + assert (result.n == 1) + + freqstr = '-00H 03T 14S' + result = frequencies.to_offset(freqstr) + assert (result.n == -194) + + def test_to_offset_pd_timedelta(self): + # Tests for #9064 + td = Timedelta(days=1, seconds=1) + result = frequencies.to_offset(td) + expected = offsets.Second(86401) + assert (expected == result) + + td = Timedelta(days=-1, seconds=1) + result = frequencies.to_offset(td) + expected = offsets.Second(-86399) + assert (expected == result) + + td = Timedelta(hours=1, minutes=10) + result = frequencies.to_offset(td) + expected = offsets.Minute(70) + assert (expected == result) + + td = Timedelta(hours=1, minutes=-10) + result = frequencies.to_offset(td) + expected = offsets.Minute(50) + assert (expected == result) + + td = Timedelta(weeks=1) + result = frequencies.to_offset(td) + expected = offsets.Day(7) + assert (expected == result) + + td1 = Timedelta(hours=1) + result1 = frequencies.to_offset(td1) + result2 = frequencies.to_offset('60min') + assert (result1 == result2) + + td = Timedelta(microseconds=1) + result = frequencies.to_offset(td) + expected = offsets.Micro(1) + assert (expected == result) + + td = 
Timedelta(microseconds=0) + tm.assertRaises(ValueError, lambda: frequencies.to_offset(td)) + + def test_anchored_shortcuts(self): + result = frequencies.to_offset('W') + expected = frequencies.to_offset('W-SUN') + assert (result == expected) + + result1 = frequencies.to_offset('Q') + result2 = frequencies.to_offset('Q-DEC') + expected = offsets.QuarterEnd(startingMonth=12) + assert (result1 == expected) + assert (result2 == expected) + + result1 = frequencies.to_offset('Q-MAY') + expected = offsets.QuarterEnd(startingMonth=5) + assert (result1 == expected) + + result1 = frequencies.to_offset('SM') + result2 = frequencies.to_offset('SM-15') + expected = offsets.SemiMonthEnd(day_of_month=15) + assert (result1 == expected) + assert (result2 == expected) + + result = frequencies.to_offset('SM-1') + expected = offsets.SemiMonthEnd(day_of_month=1) + assert (result == expected) + + result = frequencies.to_offset('SM-27') + expected = offsets.SemiMonthEnd(day_of_month=27) + assert (result == expected) + + result = frequencies.to_offset('SMS-2') + expected = offsets.SemiMonthBegin(day_of_month=2) + assert (result == expected) + + result = frequencies.to_offset('SMS-27') + expected = offsets.SemiMonthBegin(day_of_month=27) + assert (result == expected) + + # ensure invalid cases fail as expected + invalid_anchors = ['SM-0', 'SM-28', 'SM-29', + 'SM-FOO', 'BSM', 'SM--1' + 'SMS-1', 'SMS-28', 'SMS-30', + 'SMS-BAR', 'BSMS', 'SMS--2'] + for invalid_anchor in invalid_anchors: + with tm.assertRaisesRegexp(ValueError, 'Invalid frequency: '): + frequencies.to_offset(invalid_anchor) def test_get_rule_month(): @@ -182,10 +268,10 @@ def _assert_depr(freq, expected, aliases): assert isinstance(aliases, list) assert (frequencies._period_str_to_code(freq) == expected) + msg = frequencies._INVALID_FREQ_ERROR for alias in aliases: - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - assert (frequencies._period_str_to_code(alias) == expected) + with tm.assertRaisesRegexp(ValueError, msg): + frequencies._period_str_to_code(alias) _assert_depr("M", 3000, ["MTH", "MONTH", "MONTHLY"]) @@ -212,6 +298,7 @@ def _assert_depr(freq, expected, aliases): class TestFrequencyCode(tm.TestCase): + def test_freq_code(self): self.assertEqual(frequencies.get_freq('A'), 1000) self.assertEqual(frequencies.get_freq('3A'), 1000) @@ -636,8 +723,9 @@ def test_series(self): s = Series(period_range('2013', periods=10, freq=freq)) self.assertRaises(TypeError, lambda: frequencies.infer_freq(s)) for freq in ['Y']: - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): + + msg = frequencies._INVALID_FREQ_ERROR + with tm.assertRaisesRegexp(ValueError, msg): s = Series(period_range('2013', periods=10, freq=freq)) self.assertRaises(TypeError, lambda: frequencies.infer_freq(s)) @@ -652,17 +740,23 @@ def test_series(self): self.assertEqual(inferred, 'D') def test_legacy_offset_warnings(self): - for k, v in compat.iteritems(frequencies._rule_aliases): - with tm.assert_produces_warning(FutureWarning): - result = frequencies.get_offset(k) - exp = frequencies.get_offset(v) - self.assertEqual(result, exp) - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - idx = date_range('2011-01-01', periods=5, freq=k) - exp = date_range('2011-01-01', periods=5, freq=v) - self.assert_index_equal(idx, exp) + freqs = ['WEEKDAY', 'EOM', 'W@MON', 'W@TUE', 'W@WED', 'W@THU', + 'W@FRI', 'W@SAT', 'W@SUN', 'Q@JAN', 'Q@FEB', 'Q@MAR', + 'A@JAN', 'A@FEB', 'A@MAR', 'A@APR', 'A@MAY', 'A@JUN', + 'A@JUL', 'A@AUG', 
'A@SEP', 'A@OCT', 'A@NOV', 'A@DEC', + 'WOM@1MON', 'WOM@2MON', 'WOM@3MON', 'WOM@4MON', + 'WOM@1TUE', 'WOM@2TUE', 'WOM@3TUE', 'WOM@4TUE', + 'WOM@1WED', 'WOM@2WED', 'WOM@3WED', 'WOM@4WED', + 'WOM@1THU', 'WOM@2THU', 'WOM@3THU', 'WOM@4THU' + 'WOM@1FRI', 'WOM@2FRI', 'WOM@3FRI', 'WOM@4FRI'] + + msg = frequencies._INVALID_FREQ_ERROR + for freq in freqs: + with tm.assertRaisesRegexp(ValueError, msg): + frequencies.get_offset(freq) + + with tm.assertRaisesRegexp(ValueError, msg): + date_range('2011-01-01', periods=5, freq=freq) MONTHS = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', 'OCT', diff --git a/pandas/tseries/tests/test_offsets.py b/pandas/tseries/tests/test_offsets.py index ec88acc421cdb..b3da62c8d2db5 100644 --- a/pandas/tseries/tests/test_offsets.py +++ b/pandas/tseries/tests/test_offsets.py @@ -9,30 +9,31 @@ import numpy as np from pandas.compat.numpy import np_datetime64_compat -from pandas.core.datetools import (bday, BDay, CDay, BQuarterEnd, BMonthEnd, - BusinessHour, CustomBusinessHour, - CBMonthEnd, CBMonthBegin, - BYearEnd, MonthEnd, MonthBegin, BYearBegin, - QuarterBegin, - BQuarterBegin, BMonthBegin, DateOffset, - Week, YearBegin, YearEnd, Hour, Minute, - Second, Day, Micro, Milli, Nano, Easter, - WeekOfMonth, format, ole2datetime, - QuarterEnd, to_datetime, normalize_date, - get_offset, get_standard_freq) +from pandas.core.series import Series from pandas.tseries.frequencies import (_offset_map, get_freq_code, - _get_freq_str) + _get_freq_str, _INVALID_FREQ_ERROR, + get_offset, get_standard_freq) from pandas.tseries.index import _to_m8, DatetimeIndex, _daterange_cache -from pandas.tseries.tools import parse_time_string, DateParseError +from pandas.tseries.offsets import (BDay, CDay, BQuarterEnd, BMonthEnd, + BusinessHour, WeekOfMonth, CBMonthEnd, + CustomBusinessHour, WeekDay, + CBMonthBegin, BYearEnd, MonthEnd, + MonthBegin, SemiMonthBegin, SemiMonthEnd, + BYearBegin, QuarterBegin, BQuarterBegin, + BMonthBegin, DateOffset, Week, YearBegin, + YearEnd, Hour, Minute, Second, Day, Micro, + QuarterEnd, BusinessMonthEnd, FY5253, + Milli, Nano, Easter, FY5253Quarter, + LastWeekOfMonth, CacheableOffset) +from pandas.tseries.tools import (format, ole2datetime, parse_time_string, + to_datetime, DateParseError) import pandas.tseries.offsets as offsets from pandas.io.pickle import read_pickle -from pandas.tslib import NaT, Timestamp, Timedelta +from pandas.tslib import normalize_date, NaT, Timestamp, Timedelta import pandas.tslib as tslib from pandas.util.testing import assertRaisesRegexp import pandas.util.testing as tm -from pandas.tseries.offsets import BusinessMonthEnd, CacheableOffset, \ - LastWeekOfMonth, FY5253, FY5253Quarter, WeekDay from pandas.tseries.holiday import USFederalHolidayCalendar _multiprocess_can_split_ = True @@ -182,6 +183,8 @@ def setUp(self): 'BusinessMonthBegin': Timestamp('2011-01-03 09:00:00'), 'MonthEnd': Timestamp('2011-01-31 09:00:00'), + 'SemiMonthEnd': Timestamp('2011-01-15 09:00:00'), + 'SemiMonthBegin': Timestamp('2011-01-15 09:00:00'), 'BusinessMonthEnd': Timestamp('2011-01-31 09:00:00'), 'YearBegin': Timestamp('2012-01-01 09:00:00'), 'BYearBegin': Timestamp('2011-01-03 09:00:00'), @@ -258,8 +261,19 @@ def _check_offsetfunc_works(self, offset, funcname, dt, expected, self.assertTrue(isinstance(result, Timestamp)) self.assertEqual(result, expected) - # test nano second is preserved - result = func(Timestamp(dt) + Nano(5)) + # see gh-14101 + exp_warning = None + ts = Timestamp(dt) + Nano(5) + + if (offset_s.__class__.__name__ == 
'DateOffset' and + (funcname == 'apply' or normalize) and + ts.nanosecond > 0): + exp_warning = UserWarning + + # test nanosecond is preserved + with tm.assert_produces_warning(exp_warning, + check_stacklevel=False): + result = func(ts) self.assertTrue(isinstance(result, Timestamp)) if normalize is False: self.assertEqual(result, expected + Nano(5)) @@ -286,8 +300,19 @@ def _check_offsetfunc_works(self, offset, funcname, dt, expected, self.assertTrue(isinstance(result, Timestamp)) self.assertEqual(result, expected_localize) - # test nano second is preserved - result = func(Timestamp(dt, tz=tz) + Nano(5)) + # see gh-14101 + exp_warning = None + ts = Timestamp(dt, tz=tz) + Nano(5) + + if (offset_s.__class__.__name__ == 'DateOffset' and + (funcname == 'apply' or normalize) and + ts.nanosecond > 0): + exp_warning = UserWarning + + # test nanosecond is preserved + with tm.assert_produces_warning(exp_warning, + check_stacklevel=False): + result = func(ts) self.assertTrue(isinstance(result, Timestamp)) if normalize is False: self.assertEqual(result, expected_localize + Nano(5)) @@ -311,9 +336,9 @@ def test_rollforward(self): expecteds = self.expecteds.copy() # result will not be changed if the target is on the offset - no_changes = ['Day', 'MonthBegin', 'YearBegin', 'Week', 'Hour', - 'Minute', 'Second', 'Milli', 'Micro', 'Nano', - 'DateOffset'] + no_changes = ['Day', 'MonthBegin', 'SemiMonthBegin', 'YearBegin', + 'Week', 'Hour', 'Minute', 'Second', 'Milli', 'Micro', + 'Nano', 'DateOffset'] for n in no_changes: expecteds[n] = Timestamp('2011/01/01 09:00') @@ -328,6 +353,7 @@ def test_rollforward(self): normalized = {'Day': Timestamp('2011-01-02 00:00:00'), 'DateOffset': Timestamp('2011-01-02 00:00:00'), 'MonthBegin': Timestamp('2011-02-01 00:00:00'), + 'SemiMonthBegin': Timestamp('2011-01-15 00:00:00'), 'YearBegin': Timestamp('2012-01-01 00:00:00'), 'Week': Timestamp('2011-01-08 00:00:00'), 'Hour': Timestamp('2011-01-01 00:00:00'), @@ -358,6 +384,7 @@ def test_rollback(self): Timestamp('2010-12-01 09:00:00'), 'BusinessMonthBegin': Timestamp('2010-12-01 09:00:00'), 'MonthEnd': Timestamp('2010-12-31 09:00:00'), + 'SemiMonthEnd': Timestamp('2010-12-31 09:00:00'), 'BusinessMonthEnd': Timestamp('2010-12-31 09:00:00'), 'BYearBegin': Timestamp('2010-01-01 09:00:00'), 'YearEnd': Timestamp('2010-12-31 09:00:00'), @@ -375,8 +402,9 @@ def test_rollback(self): 'Easter': Timestamp('2010-04-04 09:00:00')} # result will not be changed if the target is on the offset - for n in ['Day', 'MonthBegin', 'YearBegin', 'Week', 'Hour', 'Minute', - 'Second', 'Milli', 'Micro', 'Nano', 'DateOffset']: + for n in ['Day', 'MonthBegin', 'SemiMonthBegin', 'YearBegin', 'Week', + 'Hour', 'Minute', 'Second', 'Milli', 'Micro', 'Nano', + 'DateOffset']: expecteds[n] = Timestamp('2011/01/01 09:00') # but be changed when normalize=True @@ -387,6 +415,7 @@ def test_rollback(self): normalized = {'Day': Timestamp('2010-12-31 00:00:00'), 'DateOffset': Timestamp('2010-12-31 00:00:00'), 'MonthBegin': Timestamp('2010-12-01 00:00:00'), + 'SemiMonthBegin': Timestamp('2010-12-15 00:00:00'), 'YearBegin': Timestamp('2010-01-01 00:00:00'), 'Week': Timestamp('2010-12-25 00:00:00'), 'Hour': Timestamp('2011-01-01 00:00:00'), @@ -617,38 +646,43 @@ def test_onOffset(self): def test_apply(self): tests = [] - tests.append((bday, {datetime(2008, 1, 1): datetime(2008, 1, 2), - datetime(2008, 1, 4): datetime(2008, 1, 7), - datetime(2008, 1, 5): datetime(2008, 1, 7), - datetime(2008, 1, 6): datetime(2008, 1, 7), - datetime(2008, 1, 7): datetime(2008, 1, 8)})) - 
- tests.append((2 * bday, {datetime(2008, 1, 1): datetime(2008, 1, 3), - datetime(2008, 1, 4): datetime(2008, 1, 8), - datetime(2008, 1, 5): datetime(2008, 1, 8), - datetime(2008, 1, 6): datetime(2008, 1, 8), - datetime(2008, 1, 7): datetime(2008, 1, 9)})) - - tests.append((-bday, {datetime(2008, 1, 1): datetime(2007, 12, 31), - datetime(2008, 1, 4): datetime(2008, 1, 3), - datetime(2008, 1, 5): datetime(2008, 1, 4), - datetime(2008, 1, 6): datetime(2008, 1, 4), - datetime(2008, 1, 7): datetime(2008, 1, 4), - datetime(2008, 1, 8): datetime(2008, 1, 7)})) - - tests.append((-2 * bday, {datetime(2008, 1, 1): datetime(2007, 12, 28), - datetime(2008, 1, 4): datetime(2008, 1, 2), - datetime(2008, 1, 5): datetime(2008, 1, 3), - datetime(2008, 1, 6): datetime(2008, 1, 3), - datetime(2008, 1, 7): datetime(2008, 1, 3), - datetime(2008, 1, 8): datetime(2008, 1, 4), - datetime(2008, 1, 9): datetime(2008, 1, 7)})) + tests.append((BDay(), {datetime(2008, 1, 1): datetime(2008, 1, 2), + datetime(2008, 1, 4): datetime(2008, 1, 7), + datetime(2008, 1, 5): datetime(2008, 1, 7), + datetime(2008, 1, 6): datetime(2008, 1, 7), + datetime(2008, 1, 7): datetime(2008, 1, 8)})) + + tests.append((2 * BDay(), {datetime(2008, 1, 1): datetime(2008, 1, 3), + datetime(2008, 1, 4): datetime(2008, 1, 8), + datetime(2008, 1, 5): datetime(2008, 1, 8), + datetime(2008, 1, 6): datetime(2008, 1, 8), + datetime(2008, 1, 7): datetime(2008, 1, 9)} + )) + + tests.append((-BDay(), {datetime(2008, 1, 1): datetime(2007, 12, 31), + datetime(2008, 1, 4): datetime(2008, 1, 3), + datetime(2008, 1, 5): datetime(2008, 1, 4), + datetime(2008, 1, 6): datetime(2008, 1, 4), + datetime(2008, 1, 7): datetime(2008, 1, 4), + datetime(2008, 1, 8): datetime(2008, 1, 7)} + )) + + tests.append((-2 * BDay(), { + datetime(2008, 1, 1): datetime(2007, 12, 28), + datetime(2008, 1, 4): datetime(2008, 1, 2), + datetime(2008, 1, 5): datetime(2008, 1, 3), + datetime(2008, 1, 6): datetime(2008, 1, 3), + datetime(2008, 1, 7): datetime(2008, 1, 3), + datetime(2008, 1, 8): datetime(2008, 1, 4), + datetime(2008, 1, 9): datetime(2008, 1, 7)} + )) tests.append((BDay(0), {datetime(2008, 1, 1): datetime(2008, 1, 1), datetime(2008, 1, 4): datetime(2008, 1, 4), datetime(2008, 1, 5): datetime(2008, 1, 7), datetime(2008, 1, 6): datetime(2008, 1, 7), - datetime(2008, 1, 7): datetime(2008, 1, 7)})) + datetime(2008, 1, 7): datetime(2008, 1, 7)} + )) for offset, cases in tests: for base, expected in compat.iteritems(cases): @@ -1758,35 +1792,40 @@ def test_onOffset(self): assertOnOffset(offset, d, expected) def test_apply(self): - from pandas.core.datetools import cday tests = [] - tests.append((cday, {datetime(2008, 1, 1): datetime(2008, 1, 2), - datetime(2008, 1, 4): datetime(2008, 1, 7), - datetime(2008, 1, 5): datetime(2008, 1, 7), - datetime(2008, 1, 6): datetime(2008, 1, 7), - datetime(2008, 1, 7): datetime(2008, 1, 8)})) - - tests.append((2 * cday, {datetime(2008, 1, 1): datetime(2008, 1, 3), - datetime(2008, 1, 4): datetime(2008, 1, 8), - datetime(2008, 1, 5): datetime(2008, 1, 8), - datetime(2008, 1, 6): datetime(2008, 1, 8), - datetime(2008, 1, 7): datetime(2008, 1, 9)})) - - tests.append((-cday, {datetime(2008, 1, 1): datetime(2007, 12, 31), - datetime(2008, 1, 4): datetime(2008, 1, 3), - datetime(2008, 1, 5): datetime(2008, 1, 4), - datetime(2008, 1, 6): datetime(2008, 1, 4), - datetime(2008, 1, 7): datetime(2008, 1, 4), - datetime(2008, 1, 8): datetime(2008, 1, 7)})) - - tests.append((-2 * cday, {datetime(2008, 1, 1): datetime(2007, 12, 28), - datetime(2008, 1, 4): 
datetime(2008, 1, 2), - datetime(2008, 1, 5): datetime(2008, 1, 3), - datetime(2008, 1, 6): datetime(2008, 1, 3), - datetime(2008, 1, 7): datetime(2008, 1, 3), - datetime(2008, 1, 8): datetime(2008, 1, 4), - datetime(2008, 1, 9): datetime(2008, 1, 7)})) + tests.append((CDay(), {datetime(2008, 1, 1): datetime(2008, 1, 2), + datetime(2008, 1, 4): datetime(2008, 1, 7), + datetime(2008, 1, 5): datetime(2008, 1, 7), + datetime(2008, 1, 6): datetime(2008, 1, 7), + datetime(2008, 1, 7): datetime(2008, 1, 8)})) + + tests.append((2 * CDay(), { + datetime(2008, 1, 1): datetime(2008, 1, 3), + datetime(2008, 1, 4): datetime(2008, 1, 8), + datetime(2008, 1, 5): datetime(2008, 1, 8), + datetime(2008, 1, 6): datetime(2008, 1, 8), + datetime(2008, 1, 7): datetime(2008, 1, 9)} + )) + + tests.append((-CDay(), { + datetime(2008, 1, 1): datetime(2007, 12, 31), + datetime(2008, 1, 4): datetime(2008, 1, 3), + datetime(2008, 1, 5): datetime(2008, 1, 4), + datetime(2008, 1, 6): datetime(2008, 1, 4), + datetime(2008, 1, 7): datetime(2008, 1, 4), + datetime(2008, 1, 8): datetime(2008, 1, 7)} + )) + + tests.append((-2 * CDay(), { + datetime(2008, 1, 1): datetime(2007, 12, 28), + datetime(2008, 1, 4): datetime(2008, 1, 2), + datetime(2008, 1, 5): datetime(2008, 1, 3), + datetime(2008, 1, 6): datetime(2008, 1, 3), + datetime(2008, 1, 7): datetime(2008, 1, 3), + datetime(2008, 1, 8): datetime(2008, 1, 4), + datetime(2008, 1, 9): datetime(2008, 1, 7)} + )) tests.append((CDay(0), {datetime(2008, 1, 1): datetime(2008, 1, 1), datetime(2008, 1, 4): datetime(2008, 1, 4), @@ -2646,6 +2685,353 @@ def test_onOffset(self): assertOnOffset(offset, dt, expected) +class TestSemiMonthEnd(Base): + _offset = SemiMonthEnd + + def _get_tests(self): + tests = [] + + tests.append((SemiMonthEnd(), + {datetime(2008, 1, 1): datetime(2008, 1, 15), + datetime(2008, 1, 15): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 2, 15), + datetime(2006, 12, 14): datetime(2006, 12, 15), + datetime(2006, 12, 29): datetime(2006, 12, 31), + datetime(2006, 12, 31): datetime(2007, 1, 15), + datetime(2007, 1, 1): datetime(2007, 1, 15), + datetime(2006, 12, 1): datetime(2006, 12, 15), + datetime(2006, 12, 15): datetime(2006, 12, 31)})) + + tests.append((SemiMonthEnd(day_of_month=20), + {datetime(2008, 1, 1): datetime(2008, 1, 20), + datetime(2008, 1, 15): datetime(2008, 1, 20), + datetime(2008, 1, 21): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 2, 20), + datetime(2006, 12, 14): datetime(2006, 12, 20), + datetime(2006, 12, 29): datetime(2006, 12, 31), + datetime(2006, 12, 31): datetime(2007, 1, 20), + datetime(2007, 1, 1): datetime(2007, 1, 20), + datetime(2006, 12, 1): datetime(2006, 12, 20), + datetime(2006, 12, 15): datetime(2006, 12, 20)})) + + tests.append((SemiMonthEnd(0), + {datetime(2008, 1, 1): datetime(2008, 1, 15), + datetime(2008, 1, 16): datetime(2008, 1, 31), + datetime(2008, 1, 15): datetime(2008, 1, 15), + datetime(2008, 1, 31): datetime(2008, 1, 31), + datetime(2006, 12, 29): datetime(2006, 12, 31), + datetime(2006, 12, 31): datetime(2006, 12, 31), + datetime(2007, 1, 1): datetime(2007, 1, 15)})) + + tests.append((SemiMonthEnd(0, day_of_month=16), + {datetime(2008, 1, 1): datetime(2008, 1, 16), + datetime(2008, 1, 16): datetime(2008, 1, 16), + datetime(2008, 1, 15): datetime(2008, 1, 16), + datetime(2008, 1, 31): datetime(2008, 1, 31), + datetime(2006, 12, 29): datetime(2006, 12, 31), + datetime(2006, 12, 31): datetime(2006, 12, 31), + datetime(2007, 1, 1): datetime(2007, 1, 16)})) + + 
tests.append((SemiMonthEnd(2), + {datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 2, 29), + datetime(2006, 12, 29): datetime(2007, 1, 15), + datetime(2006, 12, 31): datetime(2007, 1, 31), + datetime(2007, 1, 1): datetime(2007, 1, 31), + datetime(2007, 1, 16): datetime(2007, 2, 15), + datetime(2006, 11, 1): datetime(2006, 11, 30)})) + + tests.append((SemiMonthEnd(-1), + {datetime(2007, 1, 1): datetime(2006, 12, 31), + datetime(2008, 6, 30): datetime(2008, 6, 15), + datetime(2008, 12, 31): datetime(2008, 12, 15), + datetime(2006, 12, 29): datetime(2006, 12, 15), + datetime(2006, 12, 30): datetime(2006, 12, 15), + datetime(2007, 1, 1): datetime(2006, 12, 31)})) + + tests.append((SemiMonthEnd(-1, day_of_month=4), + {datetime(2007, 1, 1): datetime(2006, 12, 31), + datetime(2007, 1, 4): datetime(2006, 12, 31), + datetime(2008, 6, 30): datetime(2008, 6, 4), + datetime(2008, 12, 31): datetime(2008, 12, 4), + datetime(2006, 12, 5): datetime(2006, 12, 4), + datetime(2006, 12, 30): datetime(2006, 12, 4), + datetime(2007, 1, 1): datetime(2006, 12, 31)})) + + tests.append((SemiMonthEnd(-2), + {datetime(2007, 1, 1): datetime(2006, 12, 15), + datetime(2008, 6, 30): datetime(2008, 5, 31), + datetime(2008, 3, 15): datetime(2008, 2, 15), + datetime(2008, 12, 31): datetime(2008, 11, 30), + datetime(2006, 12, 29): datetime(2006, 11, 30), + datetime(2006, 12, 14): datetime(2006, 11, 15), + datetime(2007, 1, 1): datetime(2006, 12, 15)})) + + return tests + + def test_offset_whole_year(self): + dates = (datetime(2007, 12, 31), + datetime(2008, 1, 15), + datetime(2008, 1, 31), + datetime(2008, 2, 15), + datetime(2008, 2, 29), + datetime(2008, 3, 15), + datetime(2008, 3, 31), + datetime(2008, 4, 15), + datetime(2008, 4, 30), + datetime(2008, 5, 15), + datetime(2008, 5, 31), + datetime(2008, 6, 15), + datetime(2008, 6, 30), + datetime(2008, 7, 15), + datetime(2008, 7, 31), + datetime(2008, 8, 15), + datetime(2008, 8, 31), + datetime(2008, 9, 15), + datetime(2008, 9, 30), + datetime(2008, 10, 15), + datetime(2008, 10, 31), + datetime(2008, 11, 15), + datetime(2008, 11, 30), + datetime(2008, 12, 15), + datetime(2008, 12, 31)) + + for base, exp_date in zip(dates[:-1], dates[1:]): + assertEq(SemiMonthEnd(), base, exp_date) + + # ensure .apply_index works as expected + s = DatetimeIndex(dates[:-1]) + result = SemiMonthEnd().apply_index(s) + exp = DatetimeIndex(dates[1:]) + tm.assert_index_equal(result, exp) + + # ensure generating a range with DatetimeIndex gives same result + result = DatetimeIndex(start=dates[0], end=dates[-1], freq='SM') + exp = DatetimeIndex(dates) + tm.assert_index_equal(result, exp) + + def test_offset(self): + for offset, cases in self._get_tests(): + for base, expected in compat.iteritems(cases): + assertEq(offset, base, expected) + + def test_apply_index(self): + for offset, cases in self._get_tests(): + s = DatetimeIndex(cases.keys()) + result = offset.apply_index(s) + exp = DatetimeIndex(cases.values()) + tm.assert_index_equal(result, exp) + + def test_onOffset(self): + + tests = [(datetime(2007, 12, 31), True), + (datetime(2007, 12, 15), True), + (datetime(2007, 12, 14), False), + (datetime(2007, 12, 1), False), + (datetime(2008, 2, 29), True)] + + for dt, expected in tests: + assertOnOffset(SemiMonthEnd(), dt, expected) + + def test_vectorized_offset_addition(self): + for klass, assert_func in zip([Series, DatetimeIndex], + [self.assert_series_equal, + tm.assert_index_equal]): + s = klass([Timestamp('2000-01-15 00:15:00', tz='US/Central'), + 
Timestamp('2000-02-15', tz='US/Central')], name='a') + + result = s + SemiMonthEnd() + result2 = SemiMonthEnd() + s + exp = klass([Timestamp('2000-01-31 00:15:00', tz='US/Central'), + Timestamp('2000-02-29', tz='US/Central')], name='a') + assert_func(result, exp) + assert_func(result2, exp) + + s = klass([Timestamp('2000-01-01 00:15:00', tz='US/Central'), + Timestamp('2000-02-01', tz='US/Central')], name='a') + result = s + SemiMonthEnd() + result2 = SemiMonthEnd() + s + exp = klass([Timestamp('2000-01-15 00:15:00', tz='US/Central'), + Timestamp('2000-02-15', tz='US/Central')], name='a') + assert_func(result, exp) + assert_func(result2, exp) + + +class TestSemiMonthBegin(Base): + _offset = SemiMonthBegin + + def _get_tests(self): + tests = [] + + tests.append((SemiMonthBegin(), + {datetime(2008, 1, 1): datetime(2008, 1, 15), + datetime(2008, 1, 15): datetime(2008, 2, 1), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2006, 12, 14): datetime(2006, 12, 15), + datetime(2006, 12, 29): datetime(2007, 1, 1), + datetime(2006, 12, 31): datetime(2007, 1, 1), + datetime(2007, 1, 1): datetime(2007, 1, 15), + datetime(2006, 12, 1): datetime(2006, 12, 15), + datetime(2006, 12, 15): datetime(2007, 1, 1)})) + + tests.append((SemiMonthBegin(day_of_month=20), + {datetime(2008, 1, 1): datetime(2008, 1, 20), + datetime(2008, 1, 15): datetime(2008, 1, 20), + datetime(2008, 1, 21): datetime(2008, 2, 1), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2006, 12, 14): datetime(2006, 12, 20), + datetime(2006, 12, 29): datetime(2007, 1, 1), + datetime(2006, 12, 31): datetime(2007, 1, 1), + datetime(2007, 1, 1): datetime(2007, 1, 20), + datetime(2006, 12, 1): datetime(2006, 12, 20), + datetime(2006, 12, 15): datetime(2006, 12, 20)})) + + tests.append((SemiMonthBegin(0), + {datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 1, 16): datetime(2008, 2, 1), + datetime(2008, 1, 15): datetime(2008, 1, 15), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2006, 12, 29): datetime(2007, 1, 1), + datetime(2006, 12, 2): datetime(2006, 12, 15), + datetime(2007, 1, 1): datetime(2007, 1, 1)})) + + tests.append((SemiMonthBegin(0, day_of_month=16), + {datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 1, 16): datetime(2008, 1, 16), + datetime(2008, 1, 15): datetime(2008, 1, 16), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2006, 12, 29): datetime(2007, 1, 1), + datetime(2006, 12, 31): datetime(2007, 1, 1), + datetime(2007, 1, 5): datetime(2007, 1, 16), + datetime(2007, 1, 1): datetime(2007, 1, 1)})) + + tests.append((SemiMonthBegin(2), + {datetime(2008, 1, 1): datetime(2008, 2, 1), + datetime(2008, 1, 31): datetime(2008, 2, 15), + datetime(2006, 12, 1): datetime(2007, 1, 1), + datetime(2006, 12, 29): datetime(2007, 1, 15), + datetime(2006, 12, 15): datetime(2007, 1, 15), + datetime(2007, 1, 1): datetime(2007, 2, 1), + datetime(2007, 1, 16): datetime(2007, 2, 15), + datetime(2006, 11, 1): datetime(2006, 12, 1)})) + + tests.append((SemiMonthBegin(-1), + {datetime(2007, 1, 1): datetime(2006, 12, 15), + datetime(2008, 6, 30): datetime(2008, 6, 15), + datetime(2008, 6, 14): datetime(2008, 6, 1), + datetime(2008, 12, 31): datetime(2008, 12, 15), + datetime(2006, 12, 29): datetime(2006, 12, 15), + datetime(2006, 12, 15): datetime(2006, 12, 1), + datetime(2007, 1, 1): datetime(2006, 12, 15)})) + + tests.append((SemiMonthBegin(-1, day_of_month=4), + {datetime(2007, 1, 1): datetime(2006, 12, 4), + datetime(2007, 1, 4): datetime(2007, 1, 1), + datetime(2008, 6, 30): datetime(2008, 
6, 4), + datetime(2008, 12, 31): datetime(2008, 12, 4), + datetime(2006, 12, 5): datetime(2006, 12, 4), + datetime(2006, 12, 30): datetime(2006, 12, 4), + datetime(2006, 12, 2): datetime(2006, 12, 1), + datetime(2007, 1, 1): datetime(2006, 12, 4)})) + + tests.append((SemiMonthBegin(-2), + {datetime(2007, 1, 1): datetime(2006, 12, 1), + datetime(2008, 6, 30): datetime(2008, 6, 1), + datetime(2008, 6, 14): datetime(2008, 5, 15), + datetime(2008, 12, 31): datetime(2008, 12, 1), + datetime(2006, 12, 29): datetime(2006, 12, 1), + datetime(2006, 12, 15): datetime(2006, 11, 15), + datetime(2007, 1, 1): datetime(2006, 12, 1)})) + + return tests + + def test_offset_whole_year(self): + dates = (datetime(2007, 12, 15), + datetime(2008, 1, 1), + datetime(2008, 1, 15), + datetime(2008, 2, 1), + datetime(2008, 2, 15), + datetime(2008, 3, 1), + datetime(2008, 3, 15), + datetime(2008, 4, 1), + datetime(2008, 4, 15), + datetime(2008, 5, 1), + datetime(2008, 5, 15), + datetime(2008, 6, 1), + datetime(2008, 6, 15), + datetime(2008, 7, 1), + datetime(2008, 7, 15), + datetime(2008, 8, 1), + datetime(2008, 8, 15), + datetime(2008, 9, 1), + datetime(2008, 9, 15), + datetime(2008, 10, 1), + datetime(2008, 10, 15), + datetime(2008, 11, 1), + datetime(2008, 11, 15), + datetime(2008, 12, 1), + datetime(2008, 12, 15)) + + for base, exp_date in zip(dates[:-1], dates[1:]): + assertEq(SemiMonthBegin(), base, exp_date) + + # ensure .apply_index works as expected + s = DatetimeIndex(dates[:-1]) + result = SemiMonthBegin().apply_index(s) + exp = DatetimeIndex(dates[1:]) + tm.assert_index_equal(result, exp) + + # ensure generating a range with DatetimeIndex gives same result + result = DatetimeIndex(start=dates[0], end=dates[-1], freq='SMS') + exp = DatetimeIndex(dates) + tm.assert_index_equal(result, exp) + + def test_offset(self): + for offset, cases in self._get_tests(): + for base, expected in compat.iteritems(cases): + assertEq(offset, base, expected) + + def test_apply_index(self): + for offset, cases in self._get_tests(): + s = DatetimeIndex(cases.keys()) + result = offset.apply_index(s) + exp = DatetimeIndex(cases.values()) + tm.assert_index_equal(result, exp) + + def test_onOffset(self): + tests = [(datetime(2007, 12, 1), True), + (datetime(2007, 12, 15), True), + (datetime(2007, 12, 14), False), + (datetime(2007, 12, 31), False), + (datetime(2008, 2, 15), True)] + + for dt, expected in tests: + assertOnOffset(SemiMonthBegin(), dt, expected) + + def test_vectorized_offset_addition(self): + for klass, assert_func in zip([Series, DatetimeIndex], + [self.assert_series_equal, + tm.assert_index_equal]): + + s = klass([Timestamp('2000-01-15 00:15:00', tz='US/Central'), + Timestamp('2000-02-15', tz='US/Central')], name='a') + result = s + SemiMonthBegin() + result2 = SemiMonthBegin() + s + exp = klass([Timestamp('2000-02-01 00:15:00', tz='US/Central'), + Timestamp('2000-03-01', tz='US/Central')], name='a') + assert_func(result, exp) + assert_func(result2, exp) + + s = klass([Timestamp('2000-01-01 00:15:00', tz='US/Central'), + Timestamp('2000-02-01', tz='US/Central')], name='a') + result = s + SemiMonthBegin() + result2 = SemiMonthBegin() + s + exp = klass([Timestamp('2000-01-15 00:15:00', tz='US/Central'), + Timestamp('2000-02-15', tz='US/Central')], name='a') + assert_func(result, exp) + assert_func(result2, exp) + + class TestBQuarterBegin(Base): _offset = BQuarterBegin @@ -4177,8 +4563,11 @@ def test_get_offset_name(self): def test_get_offset(): - assertRaisesRegexp(ValueError, "rule.*GIBBERISH", get_offset, 
'gibberish') - assertRaisesRegexp(ValueError, "rule.*QS-JAN-B", get_offset, 'QS-JAN-B') + with tm.assertRaisesRegexp(ValueError, _INVALID_FREQ_ERROR): + get_offset('gibberish') + with tm.assertRaisesRegexp(ValueError, _INVALID_FREQ_ERROR): + get_offset('QS-JAN-B') + pairs = [ ('B', BDay()), ('b', BDay()), ('bm', BMonthEnd()), ('Bm', BMonthEnd()), ('W-MON', Week(weekday=0)), @@ -4204,10 +4593,8 @@ def test_get_offset(): def test_get_offset_legacy(): pairs = [('w@Sat', Week(weekday=5))] for name, expected in pairs: - with tm.assert_produces_warning(FutureWarning): - offset = get_offset(name) - assert offset == expected, ("Expected %r to yield %r (actual: %r)" % - (name, expected, offset)) + with tm.assertRaisesRegexp(ValueError, _INVALID_FREQ_ERROR): + get_offset(name) class TestParseTimeString(tm.TestCase): @@ -4236,23 +4623,30 @@ def test_parse_time_quarter_w_dash(self): def test_get_standard_freq(): - fstr = get_standard_freq('W') - assert fstr == get_standard_freq('w') - assert fstr == get_standard_freq('1w') - assert fstr == get_standard_freq(('W', 1)) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = get_standard_freq('WeEk') - assert fstr == result + fstr = get_standard_freq('W') + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + assert fstr == get_standard_freq('w') + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + assert fstr == get_standard_freq('1w') + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + assert fstr == get_standard_freq(('W', 1)) - fstr = get_standard_freq('5Q') - assert fstr == get_standard_freq('5q') + with tm.assertRaisesRegexp(ValueError, _INVALID_FREQ_ERROR): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + get_standard_freq('WeEk') with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = get_standard_freq('5QuarTer') - assert fstr == result + fstr = get_standard_freq('5Q') + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + assert fstr == get_standard_freq('5q') - assert fstr == get_standard_freq(('q', 5)) + with tm.assertRaisesRegexp(ValueError, _INVALID_FREQ_ERROR): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + get_standard_freq('5QuarTer') + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + assert fstr == get_standard_freq(('q', 5)) def test_quarterly_dont_normalize(): @@ -4537,6 +4931,8 @@ def test_all_offset_classes(self): BMonthEnd: ['11/2/2012', '11/30/2012'], CBMonthBegin: ['11/2/2012', '12/3/2012'], CBMonthEnd: ['11/2/2012', '11/30/2012'], + SemiMonthBegin: ['11/2/2012', '11/15/2012'], + SemiMonthEnd: ['11/2/2012', '11/15/2012'], Week: ['11/2/2012', '11/9/2012'], YearBegin: ['11/2/2012', '1/1/2013'], YearEnd: ['11/2/2012', '12/31/2012'], diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index de23306c80b71..62cfcf7f1360e 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -16,7 +16,6 @@ import pandas.tseries.period as period import pandas.tseries.offsets as offsets -import pandas.core.datetools as datetools import pandas as pd import numpy as np from numpy.random import randn @@ -36,14 +35,17 @@ def test_quarterly_negative_ordinals(self): p = Period(ordinal=-1, freq='Q-DEC') self.assertEqual(p.year, 1969) self.assertEqual(p.quarter, 4) + self.assertIsInstance(p, Period) p = Period(ordinal=-2, freq='Q-DEC') self.assertEqual(p.year, 1969) 
self.assertEqual(p.quarter, 3) + self.assertIsInstance(p, Period) p = Period(ordinal=-2, freq='M') self.assertEqual(p.year, 1969) self.assertEqual(p.month, 11) + self.assertIsInstance(p, Period) def test_period_cons_quarterly(self): # bugs in scikits.timeseries @@ -67,6 +69,7 @@ def test_period_cons_annual(self): stamp = exp.to_timestamp('D', how='end') + timedelta(days=30) p = Period(stamp, freq=freq) self.assertEqual(p, exp + 1) + self.assertIsInstance(p, Period) def test_period_cons_weekly(self): for num in range(10, 17): @@ -77,34 +80,49 @@ def test_period_cons_weekly(self): result = Period(daystr, freq=freq) expected = Period(daystr, freq='D').asfreq(freq) self.assertEqual(result, expected) + self.assertIsInstance(result, Period) + + def test_period_from_ordinal(self): + p = pd.Period('2011-01', freq='M') + res = pd.Period._from_ordinal(p.ordinal, freq='M') + self.assertEqual(p, res) + self.assertIsInstance(res, Period) def test_period_cons_nat(self): p = Period('NaT', freq='M') - self.assertEqual(p.ordinal, tslib.iNaT) - self.assertEqual(p.freq, 'M') - self.assertEqual((p + 1).ordinal, tslib.iNaT) - self.assertEqual((1 + p).ordinal, tslib.iNaT) + self.assertIs(p, pd.NaT) p = Period('nat', freq='W-SUN') - self.assertEqual(p.ordinal, tslib.iNaT) - self.assertEqual(p.freq, 'W-SUN') - self.assertEqual((p + 1).ordinal, tslib.iNaT) - self.assertEqual((1 + p).ordinal, tslib.iNaT) + self.assertIs(p, pd.NaT) p = Period(tslib.iNaT, freq='D') - self.assertEqual(p.ordinal, tslib.iNaT) - self.assertEqual(p.freq, 'D') - self.assertEqual((p + 1).ordinal, tslib.iNaT) - self.assertEqual((1 + p).ordinal, tslib.iNaT) + self.assertIs(p, pd.NaT) p = Period(tslib.iNaT, freq='3D') - self.assertEqual(p.ordinal, tslib.iNaT) - self.assertEqual(p.freq, offsets.Day(3)) - self.assertEqual(p.freqstr, '3D') - self.assertEqual((p + 1).ordinal, tslib.iNaT) - self.assertEqual((1 + p).ordinal, tslib.iNaT) + self.assertIs(p, pd.NaT) + + p = Period(tslib.iNaT, freq='1D1H') + self.assertIs(p, pd.NaT) + + p = Period('NaT') + self.assertIs(p, pd.NaT) + + p = Period(tslib.iNaT) + self.assertIs(p, pd.NaT) - self.assertRaises(ValueError, Period, 'NaT') + def test_cons_null_like(self): + # check Timestamp compat + self.assertIs(Timestamp('NaT'), pd.NaT) + self.assertIs(Period('NaT'), pd.NaT) + + self.assertIs(Timestamp(None), pd.NaT) + self.assertIs(Period(None), pd.NaT) + + self.assertIs(Timestamp(float('nan')), pd.NaT) + self.assertIs(Period(float('nan')), pd.NaT) + + self.assertIs(Timestamp(np.nan), pd.NaT) + self.assertIs(Period(np.nan), pd.NaT) def test_period_cons_mult(self): p1 = Period('2011-01', freq='3M') @@ -136,6 +154,73 @@ def test_period_cons_mult(self): with tm.assertRaisesRegexp(ValueError, msg): Period('2011-01', freq='0M') + def test_period_cons_combined(self): + p = [(Period('2011-01', freq='1D1H'), + Period('2011-01', freq='1H1D'), + Period('2011-01', freq='H')), + (Period(ordinal=1, freq='1D1H'), + Period(ordinal=1, freq='1H1D'), + Period(ordinal=1, freq='H'))] + + for p1, p2, p3 in p: + self.assertEqual(p1.ordinal, p3.ordinal) + self.assertEqual(p2.ordinal, p3.ordinal) + + self.assertEqual(p1.freq, offsets.Hour(25)) + self.assertEqual(p1.freqstr, '25H') + + self.assertEqual(p2.freq, offsets.Hour(25)) + self.assertEqual(p2.freqstr, '25H') + + self.assertEqual(p3.freq, offsets.Hour()) + self.assertEqual(p3.freqstr, 'H') + + result = p1 + 1 + self.assertEqual(result.ordinal, (p3 + 25).ordinal) + self.assertEqual(result.freq, p1.freq) + self.assertEqual(result.freqstr, '25H') + + result = p2 + 1 + 
self.assertEqual(result.ordinal, (p3 + 25).ordinal) + self.assertEqual(result.freq, p2.freq) + self.assertEqual(result.freqstr, '25H') + + result = p1 - 1 + self.assertEqual(result.ordinal, (p3 - 25).ordinal) + self.assertEqual(result.freq, p1.freq) + self.assertEqual(result.freqstr, '25H') + + result = p2 - 1 + self.assertEqual(result.ordinal, (p3 - 25).ordinal) + self.assertEqual(result.freq, p2.freq) + self.assertEqual(result.freqstr, '25H') + + msg = ('Frequency must be positive, because it' + ' represents span: -25H') + with tm.assertRaisesRegexp(ValueError, msg): + Period('2011-01', freq='-1D1H') + with tm.assertRaisesRegexp(ValueError, msg): + Period('2011-01', freq='-1H1D') + with tm.assertRaisesRegexp(ValueError, msg): + Period(ordinal=1, freq='-1D1H') + with tm.assertRaisesRegexp(ValueError, msg): + Period(ordinal=1, freq='-1H1D') + + msg = ('Frequency must be positive, because it' + ' represents span: 0D') + with tm.assertRaisesRegexp(ValueError, msg): + Period('2011-01', freq='0D0H') + with tm.assertRaisesRegexp(ValueError, msg): + Period(ordinal=1, freq='0D0H') + + # You can only combine together day and intraday offsets + msg = ('Invalid frequency: 1W1D') + with tm.assertRaisesRegexp(ValueError, msg): + Period('2011-01', freq='1W1D') + msg = ('Invalid frequency: 1D1W') + with tm.assertRaisesRegexp(ValueError, msg): + Period('2011-01', freq='1D1W') + def test_timestamp_tz_arg(self): tm._skip_if_no_pytz() import pytz @@ -197,13 +282,6 @@ def test_timestamp_tz_arg_dateutil_from_string(self): freq='M').to_timestamp(tz='dateutil/Europe/Brussels') self.assertEqual(p.tz, gettz('Europe/Brussels')) - def test_timestamp_nat_tz(self): - t = Period('NaT', freq='M').to_timestamp() - self.assertTrue(t is tslib.NaT) - - t = Period('NaT', freq='M').to_timestamp(tz='Asia/Tokyo') - self.assertTrue(t is tslib.NaT) - def test_timestamp_mult(self): p = pd.Period('2011-01', freq='M') self.assertEqual(p.to_timestamp(how='S'), pd.Timestamp('2011-01-01')) @@ -213,12 +291,6 @@ def test_timestamp_mult(self): self.assertEqual(p.to_timestamp(how='S'), pd.Timestamp('2011-01-01')) self.assertEqual(p.to_timestamp(how='E'), pd.Timestamp('2011-03-31')) - def test_timestamp_nat_mult(self): - for freq in ['M', '3M']: - p = pd.Period('NaT', freq=freq) - self.assertTrue(p.to_timestamp(how='S') is pd.NaT) - self.assertTrue(p.to_timestamp(how='E') is pd.NaT) - def test_period_constructor(self): i1 = Period('1/1/2005', freq='M') i2 = Period('Jan 2005') @@ -448,13 +520,33 @@ def test_period_deprecated_freq(self): "L": ["MILLISECOND", "MILLISECONDLY", "millisecond"], "U": ["MICROSECOND", "MICROSECONDLY", "microsecond"], "N": ["NANOSECOND", "NANOSECONDLY", "nanosecond"]} + + msg = pd.tseries.frequencies._INVALID_FREQ_ERROR for exp, freqs in iteritems(cases): for freq in freqs: + with self.assertRaisesRegexp(ValueError, msg): + Period('2016-03-01 09:00', freq=freq) + with self.assertRaisesRegexp(ValueError, msg): + Period(ordinal=1, freq=freq) + + # check supported freq-aliases still works + p1 = Period('2016-03-01 09:00', freq=exp) + p2 = Period(ordinal=1, freq=exp) + tm.assertIsInstance(p1, Period) + tm.assertIsInstance(p2, Period) - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - res = pd.Period('2016-03-01 09:00', freq=freq) - self.assertEqual(res, Period('2016-03-01 09:00', freq=exp)) + def test_hash(self): + self.assertEqual(hash(Period('2011-01', freq='M')), + hash(Period('2011-01', freq='M'))) + + self.assertNotEqual(hash(Period('2011-01-01', freq='D')), + hash(Period('2011-01', 
freq='M'))) + + self.assertNotEqual(hash(Period('2011-01', freq='3M')), + hash(Period('2011-01', freq='2M'))) + + self.assertNotEqual(hash(Period('2011-01', freq='M')), + hash(Period('2011-02', freq='M'))) def test_repr(self): p = Period('Jan-2000') @@ -552,9 +644,6 @@ def _ex(p): result = p.to_timestamp('5S', how='start') self.assertEqual(result, expected) - p = Period('NaT', freq='W') - self.assertTrue(p.to_timestamp() is tslib.NaT) - def test_start_time(self): freq_lst = ['A', 'Q', 'M', 'D', 'H', 'T', 'S'] xp = datetime(2012, 1, 1) @@ -566,9 +655,6 @@ def test_start_time(self): self.assertEqual(Period('2012', freq='W').start_time, datetime(2011, 12, 26)) - p = Period('NaT', freq='W') - self.assertTrue(p.start_time is tslib.NaT) - def test_end_time(self): p = Period('2012', freq='A') @@ -607,8 +693,13 @@ def _ex(*args): xp = _ex(2012, 1, 16) self.assertEqual(xp, p.end_time) - p = Period('NaT', freq='W') - self.assertTrue(p.end_time is tslib.NaT) + p = Period('2012', freq='1D1H') + xp = _ex(2012, 1, 2, 1) + self.assertEqual(xp, p.end_time) + + p = Period('2012', freq='1H1D') + xp = _ex(2012, 1, 2, 1) + self.assertEqual(xp, p.end_time) def test_anchor_week_end_time(self): def _ex(*args): @@ -665,19 +756,21 @@ def test_properties_weekly(self): def test_properties_weekly_legacy(self): # Test properties on Periods with daily frequency. - with tm.assert_produces_warning(FutureWarning): - w_date = Period(freq='WK', year=2007, month=1, day=7) - # + w_date = Period(freq='W', year=2007, month=1, day=7) self.assertEqual(w_date.year, 2007) self.assertEqual(w_date.quarter, 1) self.assertEqual(w_date.month, 1) self.assertEqual(w_date.week, 1) self.assertEqual((w_date - 1).week, 52) self.assertEqual(w_date.days_in_month, 31) - with tm.assert_produces_warning(FutureWarning): - exp = Period(freq='WK', year=2012, month=2, day=1) + + exp = Period(freq='W', year=2012, month=2, day=1) self.assertEqual(exp.days_in_month, 29) + msg = pd.tseries.frequencies._INVALID_FREQ_ERROR + with self.assertRaisesRegexp(ValueError, msg): + Period(freq='WK', year=2007, month=1, day=7) + def test_properties_daily(self): # Test properties on Periods with daily frequency. 
b_date = Period(freq='B', year=2007, month=1, day=1) @@ -758,15 +851,14 @@ def test_properties_secondly(self): def test_properties_nat(self): p_nat = Period('NaT', freq='M') t_nat = pd.Timestamp('NaT') + self.assertIs(p_nat, t_nat) + # confirm Period('NaT') work identical with Timestamp('NaT') for f in ['year', 'month', 'day', 'hour', 'minute', 'second', 'week', 'dayofyear', 'quarter', 'days_in_month']: self.assertTrue(np.isnan(getattr(p_nat, f))) self.assertTrue(np.isnan(getattr(t_nat, f))) - for f in ['weekofyear', 'dayofweek', 'weekday', 'qyear']: - self.assertTrue(np.isnan(getattr(p_nat, f))) - def test_pnow(self): dt = datetime.now() @@ -789,7 +881,7 @@ def test_constructor_corner(self): self.assertRaises(ValueError, Period, 1.6, freq='D') self.assertRaises(ValueError, Period, ordinal=1.6, freq='D') self.assertRaises(ValueError, Period, ordinal=2, value=1, freq='D') - self.assertRaises(ValueError, Period) + self.assertIs(Period(None), pd.NaT) self.assertRaises(ValueError, Period, month=1) p = Period('2007-01-01', freq='D') @@ -826,10 +918,11 @@ def test_asfreq_MS(self): self.assertEqual(initial.asfreq(freq="M", how="S"), Period('2013-01', 'M')) - with self.assertRaisesRegexp(ValueError, "Unknown freqstr"): + msg = pd.tseries.frequencies._INVALID_FREQ_ERROR + with self.assertRaisesRegexp(ValueError, msg): initial.asfreq(freq="MS", how="S") - with tm.assertRaisesRegexp(ValueError, "Unknown freqstr: MS"): + with tm.assertRaisesRegexp(ValueError, msg): pd.Period('2013-01', 'MS') self.assertTrue(_period_code_map.get("MS") is None) @@ -1129,123 +1222,28 @@ def test_conv_weekly(self): self.assertEqual(ival_W.asfreq('W'), ival_W) + msg = pd.tseries.frequencies._INVALID_FREQ_ERROR + with self.assertRaisesRegexp(ValueError, msg): + ival_W.asfreq('WK') + def test_conv_weekly_legacy(self): # frequency conversion tests: from Weekly Frequency - - with tm.assert_produces_warning(FutureWarning): - ival_W = Period(freq='WK', year=2007, month=1, day=1) - - with tm.assert_produces_warning(FutureWarning): - ival_WSUN = Period(freq='WK', year=2007, month=1, day=7) - with tm.assert_produces_warning(FutureWarning): - ival_WSAT = Period(freq='WK-SAT', year=2007, month=1, day=6) - with tm.assert_produces_warning(FutureWarning): - ival_WFRI = Period(freq='WK-FRI', year=2007, month=1, day=5) - with tm.assert_produces_warning(FutureWarning): - ival_WTHU = Period(freq='WK-THU', year=2007, month=1, day=4) - with tm.assert_produces_warning(FutureWarning): - ival_WWED = Period(freq='WK-WED', year=2007, month=1, day=3) - with tm.assert_produces_warning(FutureWarning): - ival_WTUE = Period(freq='WK-TUE', year=2007, month=1, day=2) - with tm.assert_produces_warning(FutureWarning): - ival_WMON = Period(freq='WK-MON', year=2007, month=1, day=1) - - ival_WSUN_to_D_start = Period(freq='D', year=2007, month=1, day=1) - ival_WSUN_to_D_end = Period(freq='D', year=2007, month=1, day=7) - ival_WSAT_to_D_start = Period(freq='D', year=2006, month=12, day=31) - ival_WSAT_to_D_end = Period(freq='D', year=2007, month=1, day=6) - ival_WFRI_to_D_start = Period(freq='D', year=2006, month=12, day=30) - ival_WFRI_to_D_end = Period(freq='D', year=2007, month=1, day=5) - ival_WTHU_to_D_start = Period(freq='D', year=2006, month=12, day=29) - ival_WTHU_to_D_end = Period(freq='D', year=2007, month=1, day=4) - ival_WWED_to_D_start = Period(freq='D', year=2006, month=12, day=28) - ival_WWED_to_D_end = Period(freq='D', year=2007, month=1, day=3) - ival_WTUE_to_D_start = Period(freq='D', year=2006, month=12, day=27) - ival_WTUE_to_D_end = 
Period(freq='D', year=2007, month=1, day=2) - ival_WMON_to_D_start = Period(freq='D', year=2006, month=12, day=26) - ival_WMON_to_D_end = Period(freq='D', year=2007, month=1, day=1) - - with tm.assert_produces_warning(FutureWarning): - ival_W_end_of_year = Period(freq='WK', year=2007, month=12, day=31) - with tm.assert_produces_warning(FutureWarning): - ival_W_end_of_quarter = Period(freq='WK', year=2007, month=3, - day=31) - with tm.assert_produces_warning(FutureWarning): - ival_W_end_of_month = Period(freq='WK', year=2007, month=1, day=31) - ival_W_to_A = Period(freq='A', year=2007) - ival_W_to_Q = Period(freq='Q', year=2007, quarter=1) - ival_W_to_M = Period(freq='M', year=2007, month=1) - - if Period(freq='D', year=2007, month=12, day=31).weekday == 6: - ival_W_to_A_end_of_year = Period(freq='A', year=2007) - else: - ival_W_to_A_end_of_year = Period(freq='A', year=2008) - - if Period(freq='D', year=2007, month=3, day=31).weekday == 6: - ival_W_to_Q_end_of_quarter = Period(freq='Q', year=2007, quarter=1) - else: - ival_W_to_Q_end_of_quarter = Period(freq='Q', year=2007, quarter=2) - - if Period(freq='D', year=2007, month=1, day=31).weekday == 6: - ival_W_to_M_end_of_month = Period(freq='M', year=2007, month=1) - else: - ival_W_to_M_end_of_month = Period(freq='M', year=2007, month=2) - - ival_W_to_B_start = Period(freq='B', year=2007, month=1, day=1) - ival_W_to_B_end = Period(freq='B', year=2007, month=1, day=5) - ival_W_to_D_start = Period(freq='D', year=2007, month=1, day=1) - ival_W_to_D_end = Period(freq='D', year=2007, month=1, day=7) - ival_W_to_H_start = Period(freq='H', year=2007, month=1, day=1, hour=0) - ival_W_to_H_end = Period(freq='H', year=2007, month=1, day=7, hour=23) - ival_W_to_T_start = Period(freq='Min', year=2007, month=1, day=1, - hour=0, minute=0) - ival_W_to_T_end = Period(freq='Min', year=2007, month=1, day=7, - hour=23, minute=59) - ival_W_to_S_start = Period(freq='S', year=2007, month=1, day=1, hour=0, - minute=0, second=0) - ival_W_to_S_end = Period(freq='S', year=2007, month=1, day=7, hour=23, - minute=59, second=59) - - self.assertEqual(ival_W.asfreq('A'), ival_W_to_A) - self.assertEqual(ival_W_end_of_year.asfreq('A'), - ival_W_to_A_end_of_year) - self.assertEqual(ival_W.asfreq('Q'), ival_W_to_Q) - self.assertEqual(ival_W_end_of_quarter.asfreq('Q'), - ival_W_to_Q_end_of_quarter) - self.assertEqual(ival_W.asfreq('M'), ival_W_to_M) - self.assertEqual(ival_W_end_of_month.asfreq('M'), - ival_W_to_M_end_of_month) - - self.assertEqual(ival_W.asfreq('B', 'S'), ival_W_to_B_start) - self.assertEqual(ival_W.asfreq('B', 'E'), ival_W_to_B_end) - - self.assertEqual(ival_W.asfreq('D', 'S'), ival_W_to_D_start) - self.assertEqual(ival_W.asfreq('D', 'E'), ival_W_to_D_end) - - self.assertEqual(ival_WSUN.asfreq('D', 'S'), ival_WSUN_to_D_start) - self.assertEqual(ival_WSUN.asfreq('D', 'E'), ival_WSUN_to_D_end) - self.assertEqual(ival_WSAT.asfreq('D', 'S'), ival_WSAT_to_D_start) - self.assertEqual(ival_WSAT.asfreq('D', 'E'), ival_WSAT_to_D_end) - self.assertEqual(ival_WFRI.asfreq('D', 'S'), ival_WFRI_to_D_start) - self.assertEqual(ival_WFRI.asfreq('D', 'E'), ival_WFRI_to_D_end) - self.assertEqual(ival_WTHU.asfreq('D', 'S'), ival_WTHU_to_D_start) - self.assertEqual(ival_WTHU.asfreq('D', 'E'), ival_WTHU_to_D_end) - self.assertEqual(ival_WWED.asfreq('D', 'S'), ival_WWED_to_D_start) - self.assertEqual(ival_WWED.asfreq('D', 'E'), ival_WWED_to_D_end) - self.assertEqual(ival_WTUE.asfreq('D', 'S'), ival_WTUE_to_D_start) - self.assertEqual(ival_WTUE.asfreq('D', 'E'), ival_WTUE_to_D_end) 
- self.assertEqual(ival_WMON.asfreq('D', 'S'), ival_WMON_to_D_start) - self.assertEqual(ival_WMON.asfreq('D', 'E'), ival_WMON_to_D_end) - - self.assertEqual(ival_W.asfreq('H', 'S'), ival_W_to_H_start) - self.assertEqual(ival_W.asfreq('H', 'E'), ival_W_to_H_end) - self.assertEqual(ival_W.asfreq('Min', 'S'), ival_W_to_T_start) - self.assertEqual(ival_W.asfreq('Min', 'E'), ival_W_to_T_end) - self.assertEqual(ival_W.asfreq('S', 'S'), ival_W_to_S_start) - self.assertEqual(ival_W.asfreq('S', 'E'), ival_W_to_S_end) - - with tm.assert_produces_warning(FutureWarning): - self.assertEqual(ival_W.asfreq('WK'), ival_W) + msg = pd.tseries.frequencies._INVALID_FREQ_ERROR + with self.assertRaisesRegexp(ValueError, msg): + Period(freq='WK', year=2007, month=1, day=1) + + with self.assertRaisesRegexp(ValueError, msg): + Period(freq='WK-SAT', year=2007, month=1, day=6) + with self.assertRaisesRegexp(ValueError, msg): + Period(freq='WK-FRI', year=2007, month=1, day=5) + with self.assertRaisesRegexp(ValueError, msg): + Period(freq='WK-THU', year=2007, month=1, day=4) + with self.assertRaisesRegexp(ValueError, msg): + Period(freq='WK-WED', year=2007, month=1, day=3) + with self.assertRaisesRegexp(ValueError, msg): + Period(freq='WK-TUE', year=2007, month=1, day=2) + with self.assertRaisesRegexp(ValueError, msg): + Period(freq='WK-MON', year=2007, month=1, day=1) def test_conv_business(self): # frequency conversion tests: from Business Frequency" @@ -1526,12 +1524,6 @@ def test_conv_secondly(self): self.assertEqual(ival_S.asfreq('S'), ival_S) - def test_asfreq_nat(self): - p = Period('NaT', freq='A') - result = p.asfreq('M') - self.assertEqual(result.ordinal, tslib.iNaT) - self.assertEqual(result.freq, 'M') - def test_asfreq_mult(self): # normal freq to mult freq p = Period(freq='A', year=2007) @@ -1603,20 +1595,59 @@ def test_asfreq_mult(self): self.assertEqual(result.ordinal, expected.ordinal) self.assertEqual(result.freq, expected.freq) - def test_asfreq_mult_nat(self): - # normal freq to mult freq - for p in [Period('NaT', freq='A'), Period('NaT', freq='3A'), - Period('NaT', freq='2M'), Period('NaT', freq='3D')]: - for freq in ['3A', offsets.YearEnd(3)]: - result = p.asfreq(freq) - expected = Period('NaT', freq='3A') - self.assertEqual(result.ordinal, pd.tslib.iNaT) - self.assertEqual(result.freq, expected.freq) + def test_asfreq_combined(self): + # normal freq to combined freq + p = Period('2007', freq='H') + + # ordinal will not change + expected = Period('2007', freq='25H') + for freq, how in zip(['1D1H', '1H1D'], ['E', 'S']): + result = p.asfreq(freq, how=how) + self.assertEqual(result, expected) + self.assertEqual(result.ordinal, expected.ordinal) + self.assertEqual(result.freq, expected.freq) - result = p.asfreq(freq, how='S') - expected = Period('NaT', freq='3A') - self.assertEqual(result.ordinal, pd.tslib.iNaT) - self.assertEqual(result.freq, expected.freq) + # combined freq to normal freq + p1 = Period(freq='1D1H', year=2007) + p2 = Period(freq='1H1D', year=2007) + + # ordinal will change because how=E is the default + result1 = p1.asfreq('H') + result2 = p2.asfreq('H') + expected = Period('2007-01-02', freq='H') + self.assertEqual(result1, expected) + self.assertEqual(result1.ordinal, expected.ordinal) + self.assertEqual(result1.freq, expected.freq) + self.assertEqual(result2, expected) + self.assertEqual(result2.ordinal, expected.ordinal) + self.assertEqual(result2.freq, expected.freq) + + # ordinal will not change + result1 = p1.asfreq('H', how='S') + result2 = p2.asfreq('H', how='S') + expected 
= Period('2007-01-01', freq='H') + self.assertEqual(result1, expected) + self.assertEqual(result1.ordinal, expected.ordinal) + self.assertEqual(result1.freq, expected.freq) + self.assertEqual(result2, expected) + self.assertEqual(result2.ordinal, expected.ordinal) + self.assertEqual(result2.freq, expected.freq) + + def test_is_leap_year(self): + # GH 13727 + for freq in ['A', 'M', 'D', 'H']: + p = Period('2000-01-01 00:00:00', freq=freq) + self.assertTrue(p.is_leap_year) + self.assertIsInstance(p.is_leap_year, bool) + + p = Period('1999-01-01 00:00:00', freq=freq) + self.assertFalse(p.is_leap_year) + + p = Period('2004-01-01 00:00:00', freq=freq) + self.assertTrue(p.is_leap_year) + + p = Period('2100-01-01 00:00:00', freq=freq) + self.assertFalse(p.is_leap_year) class TestPeriodIndex(tm.TestCase): @@ -1677,6 +1708,15 @@ def test_constructor_U(self): self.assertRaises(ValueError, period_range, '2007-1-1', periods=500, freq='X') + def test_constructor_nano(self): + idx = period_range(start=Period(ordinal=1, freq='N'), + end=Period(ordinal=4, freq='N'), freq='N') + exp = PeriodIndex([Period(ordinal=1, freq='N'), + Period(ordinal=2, freq='N'), + Period(ordinal=3, freq='N'), + Period(ordinal=4, freq='N')], freq='N') + tm.assert_index_equal(idx, exp) + def test_constructor_arrays_negative_year(self): years = np.arange(1960, 2000, dtype=np.int64).repeat(4) quarters = np.tile(np.array([1, 2, 3, 4], dtype=np.int64), 40) @@ -1706,8 +1746,12 @@ def test_constructor_corner(self): def test_constructor_fromarraylike(self): idx = period_range('2007-01', periods=20, freq='M') - self.assertRaises(ValueError, PeriodIndex, idx.values) - self.assertRaises(ValueError, PeriodIndex, list(idx.values)) + # values is an array of Period, thus can retrieve freq + tm.assert_index_equal(PeriodIndex(idx.values), idx) + tm.assert_index_equal(PeriodIndex(list(idx.values)), idx) + + self.assertRaises(ValueError, PeriodIndex, idx._values) + self.assertRaises(ValueError, PeriodIndex, list(idx._values)) self.assertRaises(ValueError, PeriodIndex, data=Period('2007', freq='A')) @@ -1742,6 +1786,113 @@ def test_constructor_datetime64arr(self): self.assertRaises(ValueError, PeriodIndex, vals, freq='D') + def test_constructor_dtype(self): + # passing a dtype with a tz should localize + idx = PeriodIndex(['2013-01', '2013-03'], dtype='period[M]') + exp = PeriodIndex(['2013-01', '2013-03'], freq='M') + tm.assert_index_equal(idx, exp) + self.assertEqual(idx.dtype, 'period[M]') + + idx = PeriodIndex(['2013-01-05', '2013-03-05'], dtype='period[3D]') + exp = PeriodIndex(['2013-01-05', '2013-03-05'], freq='3D') + tm.assert_index_equal(idx, exp) + self.assertEqual(idx.dtype, 'period[3D]') + + # if we already have a freq and its not the same, then asfreq + # (not changed) + idx = PeriodIndex(['2013-01-01', '2013-01-02'], freq='D') + + res = PeriodIndex(idx, dtype='period[M]') + exp = PeriodIndex(['2013-01', '2013-01'], freq='M') + tm.assert_index_equal(res, exp) + self.assertEqual(res.dtype, 'period[M]') + + res = PeriodIndex(idx, freq='M') + tm.assert_index_equal(res, exp) + self.assertEqual(res.dtype, 'period[M]') + + msg = 'specified freq and dtype are different' + with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): + PeriodIndex(['2011-01'], freq='M', dtype='period[D]') + + def test_constructor_empty(self): + idx = pd.PeriodIndex([], freq='M') + tm.assertIsInstance(idx, PeriodIndex) + self.assertEqual(len(idx), 0) + self.assertEqual(idx.freq, 'M') + + with tm.assertRaisesRegexp(ValueError, 'freq not specified'): + 
pd.PeriodIndex([]) + + def test_constructor_pi_nat(self): + idx = PeriodIndex([Period('2011-01', freq='M'), pd.NaT, + Period('2011-01', freq='M')]) + exp = PeriodIndex(['2011-01', 'NaT', '2011-01'], freq='M') + tm.assert_index_equal(idx, exp) + + idx = PeriodIndex(np.array([Period('2011-01', freq='M'), pd.NaT, + Period('2011-01', freq='M')])) + tm.assert_index_equal(idx, exp) + + idx = PeriodIndex([pd.NaT, pd.NaT, Period('2011-01', freq='M'), + Period('2011-01', freq='M')]) + exp = PeriodIndex(['NaT', 'NaT', '2011-01', '2011-01'], freq='M') + tm.assert_index_equal(idx, exp) + + idx = PeriodIndex(np.array([pd.NaT, pd.NaT, + Period('2011-01', freq='M'), + Period('2011-01', freq='M')])) + tm.assert_index_equal(idx, exp) + + idx = PeriodIndex([pd.NaT, pd.NaT, '2011-01', '2011-01'], freq='M') + tm.assert_index_equal(idx, exp) + + with tm.assertRaisesRegexp(ValueError, 'freq not specified'): + PeriodIndex([pd.NaT, pd.NaT]) + + with tm.assertRaisesRegexp(ValueError, 'freq not specified'): + PeriodIndex(np.array([pd.NaT, pd.NaT])) + + with tm.assertRaisesRegexp(ValueError, 'freq not specified'): + PeriodIndex(['NaT', 'NaT']) + + with tm.assertRaisesRegexp(ValueError, 'freq not specified'): + PeriodIndex(np.array(['NaT', 'NaT'])) + + def test_constructor_incompat_freq(self): + msg = "Input has different freq=D from PeriodIndex\\(freq=M\\)" + + with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): + PeriodIndex([Period('2011-01', freq='M'), pd.NaT, + Period('2011-01', freq='D')]) + + with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): + PeriodIndex(np.array([Period('2011-01', freq='M'), pd.NaT, + Period('2011-01', freq='D')])) + + # first element is pd.NaT + with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): + PeriodIndex([pd.NaT, Period('2011-01', freq='M'), + Period('2011-01', freq='D')]) + + with tm.assertRaisesRegexp(period.IncompatibleFrequency, msg): + PeriodIndex(np.array([pd.NaT, Period('2011-01', freq='M'), + Period('2011-01', freq='D')])) + + def test_constructor_mixed(self): + idx = PeriodIndex(['2011-01', pd.NaT, Period('2011-01', freq='M')]) + exp = PeriodIndex(['2011-01', 'NaT', '2011-01'], freq='M') + tm.assert_index_equal(idx, exp) + + idx = PeriodIndex(['NaT', pd.NaT, Period('2011-01', freq='M')]) + exp = PeriodIndex(['NaT', 'NaT', '2011-01'], freq='M') + tm.assert_index_equal(idx, exp) + + idx = PeriodIndex([Period('2011-01-01', freq='D'), pd.NaT, + '2012-01-01']) + exp = PeriodIndex(['2011-01-01', 'NaT', '2012-01-01'], freq='D') + tm.assert_index_equal(idx, exp) + def test_constructor_simple_new(self): idx = period_range('2007-01', name='p', periods=2, freq='M') result = idx._simple_new(idx, 'p', freq=idx.freq) @@ -1840,6 +1991,89 @@ def test_constructor_freq_mult_dti_compat(self): periods=10).to_period(freqstr) tm.assert_index_equal(pidx, expected) + def test_constructor_freq_combined(self): + for freq in ['1D1H', '1H1D']: + pidx = PeriodIndex(['2016-01-01', '2016-01-02'], freq=freq) + expected = PeriodIndex(['2016-01-01 00:00', '2016-01-02 00:00'], + freq='25H') + for freq, func in zip(['1D1H', '1H1D'], [PeriodIndex, period_range]): + pidx = func(start='2016-01-01', periods=2, freq=freq) + expected = PeriodIndex(['2016-01-01 00:00', '2016-01-02 01:00'], + freq='25H') + tm.assert_index_equal(pidx, expected) + + def test_dtype_str(self): + pi = pd.PeriodIndex([], freq='M') + self.assertEqual(pi.dtype_str, 'period[M]') + self.assertEqual(pi.dtype_str, str(pi.dtype)) + + pi = pd.PeriodIndex([], freq='3M') + self.assertEqual(pi.dtype_str, 'period[3M]') 
+ self.assertEqual(pi.dtype_str, str(pi.dtype)) + + def test_view_asi8(self): + idx = pd.PeriodIndex([], freq='M') + + exp = np.array([], dtype=np.int64) + tm.assert_numpy_array_equal(idx.view('i8'), exp) + tm.assert_numpy_array_equal(idx.asi8, exp) + + idx = pd.PeriodIndex(['2011-01', pd.NaT], freq='M') + + exp = np.array([492, -9223372036854775808], dtype=np.int64) + tm.assert_numpy_array_equal(idx.view('i8'), exp) + tm.assert_numpy_array_equal(idx.asi8, exp) + + exp = np.array([14975, -9223372036854775808], dtype=np.int64) + idx = pd.PeriodIndex(['2011-01-01', pd.NaT], freq='D') + tm.assert_numpy_array_equal(idx.view('i8'), exp) + tm.assert_numpy_array_equal(idx.asi8, exp) + + def test_values(self): + idx = pd.PeriodIndex([], freq='M') + + exp = np.array([], dtype=np.object) + tm.assert_numpy_array_equal(idx.values, exp) + tm.assert_numpy_array_equal(idx.get_values(), exp) + exp = np.array([], dtype=np.int64) + tm.assert_numpy_array_equal(idx._values, exp) + + idx = pd.PeriodIndex(['2011-01', pd.NaT], freq='M') + + exp = np.array([pd.Period('2011-01', freq='M'), pd.NaT], dtype=object) + tm.assert_numpy_array_equal(idx.values, exp) + tm.assert_numpy_array_equal(idx.get_values(), exp) + exp = np.array([492, -9223372036854775808], dtype=np.int64) + tm.assert_numpy_array_equal(idx._values, exp) + + idx = pd.PeriodIndex(['2011-01-01', pd.NaT], freq='D') + + exp = np.array([pd.Period('2011-01-01', freq='D'), pd.NaT], + dtype=object) + tm.assert_numpy_array_equal(idx.values, exp) + tm.assert_numpy_array_equal(idx.get_values(), exp) + exp = np.array([14975, -9223372036854775808], dtype=np.int64) + tm.assert_numpy_array_equal(idx._values, exp) + + def test_asobject_like(self): + idx = pd.PeriodIndex([], freq='M') + + exp = np.array([], dtype=object) + tm.assert_numpy_array_equal(idx.asobject.values, exp) + tm.assert_numpy_array_equal(idx._mpl_repr(), exp) + + idx = pd.PeriodIndex(['2011-01', pd.NaT], freq='M') + + exp = np.array([pd.Period('2011-01', freq='M'), pd.NaT], dtype=object) + tm.assert_numpy_array_equal(idx.asobject.values, exp) + tm.assert_numpy_array_equal(idx._mpl_repr(), exp) + + exp = np.array([pd.Period('2011-01-01', freq='D'), pd.NaT], + dtype=object) + idx = pd.PeriodIndex(['2011-01-01', pd.NaT], freq='D') + tm.assert_numpy_array_equal(idx.asobject.values, exp) + tm.assert_numpy_array_equal(idx._mpl_repr(), exp) + def test_is_(self): create_index = lambda: PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') @@ -1870,8 +2104,23 @@ def test_getitem_ndim2(self): idx = period_range('2007-01', periods=3, freq='M') result = idx[:, None] - # MPL kludge + # MPL kludge, internally has incorrect shape tm.assertIsInstance(result, PeriodIndex) + self.assertEqual(result.shape, (len(idx), )) + + def test_getitem_index(self): + idx = period_range('2007-01', periods=10, freq='M', name='x') + + result = idx[[1, 3, 5]] + exp = pd.PeriodIndex(['2007-02', '2007-04', '2007-06'], + freq='M', name='x') + tm.assert_index_equal(result, exp) + + result = idx[[True, True, False, False, False, + True, True, False, False, False]] + exp = pd.PeriodIndex(['2007-01', '2007-02', '2007-06', '2007-07'], + freq='M', name='x') + tm.assert_index_equal(result, exp) def test_getitem_partial(self): rng = period_range('2007-01', periods=50, freq='M') @@ -1917,6 +2166,26 @@ def test_getitem_datetime(self): rs = ts[dt1:dt4] tm.assert_series_equal(rs, ts) + def test_getitem_nat(self): + idx = pd.PeriodIndex(['2011-01', 'NaT', '2011-02'], freq='M') + self.assertEqual(idx[0], pd.Period('2011-01', freq='M')) + 
self.assertIs(idx[1], tslib.NaT) + + s = pd.Series([0, 1, 2], index=idx) + self.assertEqual(s[pd.NaT], 1) + + s = pd.Series(idx, index=idx) + self.assertEqual(s[pd.Period('2011-01', freq='M')], + pd.Period('2011-01', freq='M')) + self.assertIs(s[pd.NaT], tslib.NaT) + + def test_getitem_list_periods(self): + # GH 7710 + rng = period_range(start='2012-01-01', periods=10, freq='D') + ts = Series(lrange(len(rng)), index=rng) + exp = ts.iloc[[1]] + tm.assert_series_equal(ts[[Period('2012-01-02', freq='D')]], exp) + def test_slice_with_negative_step(self): ts = Series(np.arange(20), period_range('2014-01', periods=20, freq='M')) @@ -1960,6 +2229,20 @@ def test_contains(self): self.assertFalse(Period('2007-01', freq='D') in rng) self.assertFalse(Period('2007-01', freq='2M') in rng) + def test_contains_nat(self): + # GH13582 + idx = period_range('2007-01', freq='M', periods=10) + self.assertFalse(pd.NaT in idx) + self.assertFalse(None in idx) + self.assertFalse(float('nan') in idx) + self.assertFalse(np.nan in idx) + + idx = pd.PeriodIndex(['2011-01', 'NaT', '2011-02'], freq='M') + self.assertTrue(pd.NaT in idx) + self.assertTrue(None in idx) + self.assertTrue(float('nan') in idx) + self.assertTrue(np.nan in idx) + def test_sub(self): rng = period_range('2007-01', periods=50) @@ -2082,6 +2365,32 @@ def test_to_timestamp_pi_mult(self): ['2011-02-28', 'NaT', '2011-03-31'], name='idx') self.assert_index_equal(result, expected) + def test_to_timestamp_pi_combined(self): + idx = PeriodIndex(start='2011', periods=2, freq='1D1H', name='idx') + result = idx.to_timestamp() + expected = DatetimeIndex( + ['2011-01-01 00:00', '2011-01-02 01:00'], name='idx') + self.assert_index_equal(result, expected) + result = idx.to_timestamp(how='E') + expected = DatetimeIndex( + ['2011-01-02 00:59:59', '2011-01-03 01:59:59'], name='idx') + self.assert_index_equal(result, expected) + result = idx.to_timestamp(how='E', freq='H') + expected = DatetimeIndex( + ['2011-01-02 00:00', '2011-01-03 01:00'], name='idx') + self.assert_index_equal(result, expected) + + def test_to_timestamp_to_period_astype(self): + idx = DatetimeIndex([pd.NaT, '2011-01-01', '2011-02-01'], name='idx') + + res = idx.astype('period[M]') + exp = PeriodIndex(['NaT', '2011-01', '2011-02'], freq='M', name='idx') + tm.assert_index_equal(res, exp) + + res = idx.astype('period[3M]') + exp = PeriodIndex(['NaT', '2011-01', '2011-02'], freq='3M', name='idx') + self.assert_index_equal(res, exp) + def test_start_time(self): index = PeriodIndex(freq='M', start='2016-01-01', end='2016-05-31') expected_index = date_range('2016-01-01', end='2016-05-31', freq='MS') @@ -2326,15 +2635,6 @@ def test_constructor(self): vals = np.array(vals) self.assertRaises(ValueError, PeriodIndex, vals) - def test_repeat(self): - index = period_range('20010101', periods=2) - expected = PeriodIndex([ - Period('2001-01-01'), Period('2001-01-01'), - Period('2001-01-02'), Period('2001-01-02'), - ]) - - tm.assert_index_equal(index.repeat(2), expected) - def test_numpy_repeat(self): index = period_range('20010101', periods=2) expected = PeriodIndex([Period('2001-01-01'), Period('2001-01-01'), @@ -2493,6 +2793,33 @@ def test_asfreq_mult_pi(self): self.assert_index_equal(result, exp) self.assertEqual(result.freq, exp.freq) + def test_asfreq_combined_pi(self): + pi = pd.PeriodIndex(['2001-01-01 00:00', '2001-01-02 02:00', 'NaT'], + freq='H') + exp = PeriodIndex(['2001-01-01 00:00', '2001-01-02 02:00', 'NaT'], + freq='25H') + for freq, how in zip(['1D1H', '1H1D'], ['S', 'E']): + result = 
pi.asfreq(freq, how=how) + self.assert_index_equal(result, exp) + self.assertEqual(result.freq, exp.freq) + + for freq in ['1D1H', '1H1D']: + pi = pd.PeriodIndex(['2001-01-01 00:00', '2001-01-02 02:00', + 'NaT'], freq=freq) + result = pi.asfreq('H') + exp = PeriodIndex(['2001-01-02 00:00', '2001-01-03 02:00', 'NaT'], + freq='H') + self.assert_index_equal(result, exp) + self.assertEqual(result.freq, exp.freq) + + pi = pd.PeriodIndex(['2001-01-01 00:00', '2001-01-02 02:00', + 'NaT'], freq=freq) + result = pi.asfreq('H', how='S') + exp = PeriodIndex(['2001-01-01 00:00', '2001-01-02 02:00', 'NaT'], + freq='H') + self.assert_index_equal(result, exp) + self.assertEqual(result.freq, exp.freq) + def test_period_index_length(self): pi = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') self.assertEqual(len(pi), 9) @@ -2582,9 +2909,9 @@ def test_asfreq_ts(self): tm.assert_index_equal(result.index, index.asfreq('D', how='start')) def test_badinput(self): - self.assertRaises(datetools.DateParseError, Period, '1/1/-2000', 'A') - # self.assertRaises(datetools.DateParseError, Period, '-2000', 'A') - # self.assertRaises(datetools.DateParseError, Period, '0', 'A') + self.assertRaises(ValueError, Period, '-2000', 'A') + self.assertRaises(tslib.DateParseError, Period, '0', 'A') + self.assertRaises(tslib.DateParseError, Period, '1/1/-2000', 'A') def test_negative_ordinals(self): Period(ordinal=-1000, freq='A') @@ -2763,6 +3090,16 @@ def test_range_slice_outofbounds(self): tm.assert_frame_equal(df['2013-06':'2013-09'], empty) tm.assert_frame_equal(df['2013-11':'2013-12'], empty) + def test_astype_asfreq(self): + pi1 = PeriodIndex(['2011-01-01', '2011-02-01', '2011-03-01'], freq='D') + exp = PeriodIndex(['2011-01', '2011-02', '2011-03'], freq='M') + tm.assert_index_equal(pi1.asfreq('M'), exp) + tm.assert_index_equal(pi1.astype('period[M]'), exp) + + exp = PeriodIndex(['2011-01', '2011-02', '2011-03'], freq='3M') + tm.assert_index_equal(pi1.asfreq('3M'), exp) + tm.assert_index_equal(pi1.astype('period[3M]'), exp) + def test_pindex_fieldaccessor_nat(self): idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2012-03', '2012-04'], freq='D') @@ -2787,6 +3124,25 @@ def test_period_dt64_round_trip(self): pi = dti.to_period(freq='H') tm.assert_index_equal(pi.to_timestamp(), dti) + def test_period_astype_to_timestamp(self): + pi = pd.PeriodIndex(['2011-01', '2011-02', '2011-03'], freq='M') + + exp = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01']) + tm.assert_index_equal(pi.astype('datetime64[ns]'), exp) + + exp = pd.DatetimeIndex(['2011-01-31', '2011-02-28', '2011-03-31']) + tm.assert_index_equal(pi.astype('datetime64[ns]', how='end'), exp) + + exp = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01'], + tz='US/Eastern') + res = pi.astype('datetime64[ns, US/Eastern]') + tm.assert_index_equal(pi.astype('datetime64[ns, US/Eastern]'), exp) + + exp = pd.DatetimeIndex(['2011-01-31', '2011-02-28', '2011-03-31'], + tz='US/Eastern') + res = pi.astype('datetime64[ns, US/Eastern]', how='end') + tm.assert_index_equal(res, exp) + def test_to_period_quarterly(self): # make sure we can make the round trip for month in MONTHS: @@ -2817,11 +3173,14 @@ def test_to_period_monthish(self): prng = rng.to_period() self.assertEqual(prng.freq, 'M') - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - rng = date_range('01-Jan-2012', periods=8, freq='EOM') + rng = date_range('01-Jan-2012', periods=8, freq='M') prng = rng.to_period() self.assertEqual(prng.freq, 'M') + msg = 
pd.tseries.frequencies._INVALID_FREQ_ERROR + with self.assertRaisesRegexp(ValueError, msg): + date_range('01-Jan-2012', periods=8, freq='EOM') + def test_multiples(self): result1 = Period('1989', freq='2A') result2 = Period('1989', freq='A') @@ -3117,9 +3476,10 @@ def test_fields(self): def _check_all_fields(self, periodindex): fields = ['year', 'month', 'day', 'hour', 'minute', 'second', 'weekofyear', 'week', 'dayofweek', 'weekday', 'dayofyear', - 'quarter', 'qyear', 'days_in_month'] + 'quarter', 'qyear', 'days_in_month', 'is_leap_year'] periods = list(periodindex) + s = pd.Series(periodindex) for field in fields: field_idx = getattr(periodindex, field) @@ -3127,6 +3487,14 @@ def _check_all_fields(self, periodindex): for x, val in zip(periods, field_idx): self.assertEqual(getattr(x, field), val) + if len(s) == 0: + continue + + field_s = getattr(s.dt, field) + self.assertEqual(len(periodindex), len(field_s)) + for x, val in zip(periods, field_s): + self.assertEqual(getattr(x, field), val) + def test_is_full(self): index = PeriodIndex([2005, 2007, 2009], freq='A') self.assertFalse(index.is_full) @@ -3198,12 +3566,20 @@ def test_with_multi_index(self): tm.assertIsInstance(s.index.values[0][0], Period) - def test_to_datetime_1703(self): + def test_to_timestamp_1703(self): index = period_range('1/1/2012', periods=4, freq='D') - result = index.to_datetime() + result = index.to_timestamp() self.assertEqual(result[0], Timestamp('1/1/2012')) + def test_to_datetime_depr(self): + index = period_range('1/1/2012', periods=4, freq='D') + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = index.to_datetime() + self.assertEqual(result[0], Timestamp('1/1/2012')) + def test_get_loc_msg(self): idx = period_range('2000-1-1', freq='A', periods=10) bad_period = Period('2012', 'A') @@ -3214,6 +3590,17 @@ def test_get_loc_msg(self): except KeyError as inst: self.assertEqual(inst.args[0], bad_period) + def test_get_loc_nat(self): + didx = DatetimeIndex(['2011-01-01', 'NaT', '2011-01-03']) + pidx = PeriodIndex(['2011-01-01', 'NaT', '2011-01-03'], freq='M') + + # check DatetimeIndex compat + for idx in [didx, pidx]: + self.assertEqual(idx.get_loc(pd.NaT), 1) + self.assertEqual(idx.get_loc(None), 1) + self.assertEqual(idx.get_loc(float('nan')), 1) + self.assertEqual(idx.get_loc(np.nan), 1) + def test_append_concat(self): # #1815 d1 = date_range('12/31/1990', '12/31/1999', freq='A-DEC') @@ -3245,7 +3632,7 @@ def test_factorize(self): idx1 = PeriodIndex(['2014-01', '2014-01', '2014-02', '2014-02', '2014-03', '2014-03'], freq='M') - exp_arr = np.array([0, 0, 1, 1, 2, 2]) + exp_arr = np.array([0, 0, 1, 1, 2, 2], dtype=np.intp) exp_idx = PeriodIndex(['2014-01', '2014-02', '2014-03'], freq='M') arr, idx = idx1.factorize() @@ -3259,12 +3646,12 @@ def test_factorize(self): idx2 = pd.PeriodIndex(['2014-03', '2014-03', '2014-02', '2014-01', '2014-03', '2014-01'], freq='M') - exp_arr = np.array([2, 2, 1, 0, 2, 0]) + exp_arr = np.array([2, 2, 1, 0, 2, 0], dtype=np.intp) arr, idx = idx2.factorize(sort=True) self.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) - exp_arr = np.array([0, 0, 1, 2, 0, 2]) + exp_arr = np.array([0, 0, 1, 2, 0, 2], dtype=np.intp) exp_idx = PeriodIndex(['2014-03', '2014-02', '2014-01'], freq='M') arr, idx = idx2.factorize() self.assert_numpy_array_equal(arr, exp_arr) @@ -3498,95 +3885,87 @@ def test_add_offset_nat(self): for freq in ['A', '2A', '3A']: p = Period('NaT', freq=freq) for o in [offsets.YearEnd(2)]: - self.assertEqual((p + 
o).ordinal, tslib.iNaT) - self.assertEqual((o + p).ordinal, tslib.iNaT) + self.assertIs(p + o, tslib.NaT) + self.assertIs(o + p, tslib.NaT) for o in [offsets.YearBegin(2), offsets.MonthBegin(1), offsets.Minute(), np.timedelta64(365, 'D'), timedelta(365)]: - with tm.assertRaises(period.IncompatibleFrequency): - p + o + self.assertIs(p + o, tslib.NaT) if isinstance(o, np.timedelta64): with tm.assertRaises(TypeError): o + p else: - with tm.assertRaises(period.IncompatibleFrequency): - o + p + self.assertIs(o + p, tslib.NaT) for freq in ['M', '2M', '3M']: p = Period('NaT', freq=freq) for o in [offsets.MonthEnd(2), offsets.MonthEnd(12)]: - self.assertEqual((p + o).ordinal, tslib.iNaT) + self.assertIs(p + o, tslib.NaT) if isinstance(o, np.timedelta64): with tm.assertRaises(TypeError): o + p else: - self.assertEqual((o + p).ordinal, tslib.iNaT) + self.assertIs(o + p, tslib.NaT) for o in [offsets.YearBegin(2), offsets.MonthBegin(1), offsets.Minute(), np.timedelta64(365, 'D'), timedelta(365)]: - with tm.assertRaises(period.IncompatibleFrequency): - p + o + self.assertIs(p + o, tslib.NaT) if isinstance(o, np.timedelta64): with tm.assertRaises(TypeError): o + p else: - with tm.assertRaises(period.IncompatibleFrequency): - o + p + self.assertIs(o + p, tslib.NaT) + # freq is Tick for freq in ['D', '2D', '3D']: p = Period('NaT', freq=freq) for o in [offsets.Day(5), offsets.Hour(24), np.timedelta64(2, 'D'), np.timedelta64(3600 * 24, 's'), timedelta(-2), timedelta(hours=48)]: - self.assertEqual((p + o).ordinal, tslib.iNaT) + self.assertIs(p + o, tslib.NaT) if isinstance(o, np.timedelta64): with tm.assertRaises(TypeError): o + p else: - self.assertEqual((o + p).ordinal, tslib.iNaT) + self.assertIs(o + p, tslib.NaT) for o in [offsets.YearBegin(2), offsets.MonthBegin(1), offsets.Minute(), np.timedelta64(4, 'h'), timedelta(hours=23)]: - - with tm.assertRaises(period.IncompatibleFrequency): - p + o + self.assertIs(p + o, tslib.NaT) if isinstance(o, np.timedelta64): with tm.assertRaises(TypeError): o + p else: - with tm.assertRaises(period.IncompatibleFrequency): - o + p + self.assertIs(o + p, tslib.NaT) for freq in ['H', '2H', '3H']: p = Period('NaT', freq=freq) for o in [offsets.Day(2), offsets.Hour(3), np.timedelta64(3, 'h'), np.timedelta64(3600, 's'), timedelta(minutes=120), timedelta(days=4, minutes=180)]: - self.assertEqual((p + o).ordinal, tslib.iNaT) + self.assertIs(p + o, tslib.NaT) if not isinstance(o, np.timedelta64): - self.assertEqual((o + p).ordinal, tslib.iNaT) + self.assertIs(o + p, tslib.NaT) for o in [offsets.YearBegin(2), offsets.MonthBegin(1), offsets.Minute(), np.timedelta64(3200, 's'), timedelta(hours=23, minutes=30)]: - with tm.assertRaises(period.IncompatibleFrequency): - p + o + self.assertIs(p + o, tslib.NaT) if isinstance(o, np.timedelta64): with tm.assertRaises(TypeError): o + p else: - with tm.assertRaises(period.IncompatibleFrequency): - o + p + self.assertIs(o + p, tslib.NaT) def test_sub_pdnat(self): # GH 13071 @@ -3671,24 +4050,22 @@ def test_sub_offset_nat(self): for freq in ['A', '2A', '3A']: p = Period('NaT', freq=freq) for o in [offsets.YearEnd(2)]: - self.assertEqual((p - o).ordinal, tslib.iNaT) + self.assertIs(p - o, tslib.NaT) for o in [offsets.YearBegin(2), offsets.MonthBegin(1), offsets.Minute(), np.timedelta64(365, 'D'), timedelta(365)]: - with tm.assertRaises(period.IncompatibleFrequency): - p - o + self.assertIs(p - o, tslib.NaT) for freq in ['M', '2M', '3M']: p = Period('NaT', freq=freq) for o in [offsets.MonthEnd(2), offsets.MonthEnd(12)]: - self.assertEqual((p - 
o).ordinal, tslib.iNaT) + self.assertIs(p - o, tslib.NaT) for o in [offsets.YearBegin(2), offsets.MonthBegin(1), offsets.Minute(), np.timedelta64(365, 'D'), timedelta(365)]: - with tm.assertRaises(period.IncompatibleFrequency): - p - o + self.assertIs(p - o, tslib.NaT) # freq is Tick for freq in ['D', '2D', '3D']: @@ -3696,37 +4073,33 @@ def test_sub_offset_nat(self): for o in [offsets.Day(5), offsets.Hour(24), np.timedelta64(2, 'D'), np.timedelta64(3600 * 24, 's'), timedelta(-2), timedelta(hours=48)]: - self.assertEqual((p - o).ordinal, tslib.iNaT) + self.assertIs(p - o, tslib.NaT) for o in [offsets.YearBegin(2), offsets.MonthBegin(1), offsets.Minute(), np.timedelta64(4, 'h'), timedelta(hours=23)]: - with tm.assertRaises(period.IncompatibleFrequency): - p - o + self.assertIs(p - o, tslib.NaT) for freq in ['H', '2H', '3H']: p = Period('NaT', freq=freq) for o in [offsets.Day(2), offsets.Hour(3), np.timedelta64(3, 'h'), np.timedelta64(3600, 's'), timedelta(minutes=120), timedelta(days=4, minutes=180)]: - self.assertEqual((p - o).ordinal, tslib.iNaT) + self.assertIs(p - o, tslib.NaT) for o in [offsets.YearBegin(2), offsets.MonthBegin(1), offsets.Minute(), np.timedelta64(3200, 's'), timedelta(hours=23, minutes=30)]: - with tm.assertRaises(period.IncompatibleFrequency): - p - o + self.assertIs(p - o, tslib.NaT) def test_nat_ops(self): for freq in ['M', '2M', '3M']: p = Period('NaT', freq=freq) - self.assertEqual((p + 1).ordinal, tslib.iNaT) - self.assertEqual((1 + p).ordinal, tslib.iNaT) - self.assertEqual((p - 1).ordinal, tslib.iNaT) - self.assertEqual((p - Period('2011-01', freq=freq)).ordinal, - tslib.iNaT) - self.assertEqual((Period('2011-01', freq=freq) - p).ordinal, - tslib.iNaT) + self.assertIs(p + 1, tslib.NaT) + self.assertIs(1 + p, tslib.NaT) + self.assertIs(p - 1, tslib.NaT) + self.assertIs(p - Period('2011-01', freq=freq), tslib.NaT) + self.assertIs(Period('2011-01', freq=freq) - p, tslib.NaT) def test_period_ops_offset(self): p = Period('2011-04-01', freq='D') @@ -3752,18 +4125,17 @@ class TestPeriodIndexSeriesMethods(tm.TestCase): def _check(self, values, func, expected): idx = pd.PeriodIndex(values) result = func(idx) - tm.assert_index_equal(result, pd.PeriodIndex(expected)) + if isinstance(expected, pd.Index): + tm.assert_index_equal(result, expected) + else: + # comp op results in bool + tm.assert_numpy_array_equal(result, expected) s = pd.Series(values) result = func(s) - exp = pd.Series(expected) - # Period(NaT) != Period(NaT) - - lmask = result.map(lambda x: x.ordinal != tslib.iNaT) - rmask = exp.map(lambda x: x.ordinal != tslib.iNaT) - tm.assert_series_equal(lmask, rmask) - tm.assert_series_equal(result[lmask], exp[rmask]) + exp = pd.Series(expected, name=values.name) + tm.assert_series_equal(result, exp) def test_pi_ops(self): idx = PeriodIndex(['2011-01', '2011-02', '2011-03', @@ -3789,6 +4161,7 @@ def test_pi_ops_errors(self): s = pd.Series(idx) msg = "unsupported operand type\(s\)" + for obj in [idx, s]: for ng in ["str", 1.5]: with tm.assertRaisesRegexp(TypeError, msg): @@ -3801,6 +4174,24 @@ def test_pi_ops_errors(self): with tm.assertRaisesRegexp(TypeError, msg): obj - ng + with tm.assertRaises(TypeError): + np.add(obj, ng) + + if _np_version_under1p9: + self.assertIs(np.add(ng, obj), NotImplemented) + else: + with tm.assertRaises(TypeError): + np.add(ng, obj) + + with tm.assertRaises(TypeError): + np.subtract(obj, ng) + + if _np_version_under1p9: + self.assertIs(np.subtract(ng, obj), NotImplemented) + else: + with tm.assertRaises(TypeError): + np.subtract(ng, obj) + 
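Note (editorial aside, not part of the patch): the hunks above change the expected semantics of arithmetic involving Period('NaT') — instead of raising period.IncompatibleFrequency or producing a Period whose ordinal is iNaT, the operations are now asserted to return the pd.NaT singleton. A minimal sketch mirroring the assertions in test_add_offset_nat / test_nat_ops, for the pandas version this patch targets:

import pandas as pd
from pandas.tseries import offsets

p = pd.Period('NaT', freq='M')
# per test_nat_ops: integer addition on a NaT period yields the NaT singleton
assert (p + 1) is pd.NaT
# per test_add_offset_nat: adding a compatible offset also yields NaT
assert (p + offsets.MonthEnd(2)) is pd.NaT
# per test_nat_ops: subtracting a real Period from a NaT period yields NaT
assert (p - pd.Period('2011-01', freq='M')) is pd.NaT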
def test_pi_ops_nat(self): idx = PeriodIndex(['2011-01', '2011-02', 'NaT', '2011-04'], freq='M', name='idx') @@ -3808,8 +4199,22 @@ def test_pi_ops_nat(self): 'NaT', '2011-06'], freq='M', name='idx') self._check(idx, lambda x: x + 2, expected) self._check(idx, lambda x: 2 + x, expected) + self._check(idx, lambda x: np.add(x, 2), expected) self._check(idx + 2, lambda x: x - 2, idx) + self._check(idx + 2, lambda x: np.subtract(x, 2), idx) + + # freq with mult + idx = PeriodIndex(['2011-01', '2011-02', 'NaT', + '2011-04'], freq='2M', name='idx') + expected = PeriodIndex(['2011-07', '2011-08', + 'NaT', '2011-10'], freq='2M', name='idx') + self._check(idx, lambda x: x + 3, expected) + self._check(idx, lambda x: 3 + x, expected) + self._check(idx, lambda x: np.add(x, 3), expected) + + self._check(idx + 3, lambda x: x - 3, idx) + self._check(idx + 3, lambda x: np.subtract(x, 3), idx) def test_pi_ops_array_int(self): idx = PeriodIndex(['2011-01', '2011-02', 'NaT', @@ -3880,11 +4285,20 @@ def test_pi_sub_period(self): exp = pd.Index([-12, -11, -10, -9], name='idx') tm.assert_index_equal(result, exp) + result = np.subtract(idx, pd.Period('2012-01', freq='M')) + tm.assert_index_equal(result, exp) + result = pd.Period('2012-01', freq='M') - idx exp = pd.Index([12, 11, 10, 9], name='idx') tm.assert_index_equal(result, exp) - exp = pd.Index([np.nan, np.nan, np.nan, np.nan], name='idx') + result = np.subtract(pd.Period('2012-01', freq='M'), idx) + if _np_version_under1p9: + self.assertIs(result, NotImplemented) + else: + tm.assert_index_equal(result, exp) + + exp = pd.TimedeltaIndex([np.nan, np.nan, np.nan, np.nan], name='idx') tm.assert_index_equal(idx - pd.Period('NaT', freq='M'), exp) tm.assert_index_equal(pd.Period('NaT', freq='M') - idx, exp) @@ -3909,10 +4323,82 @@ def test_pi_sub_period_nat(self): exp = pd.Index([12, np.nan, 10, 9], name='idx') tm.assert_index_equal(result, exp) - exp = pd.Index([np.nan, np.nan, np.nan, np.nan], name='idx') + exp = pd.TimedeltaIndex([np.nan, np.nan, np.nan, np.nan], name='idx') tm.assert_index_equal(idx - pd.Period('NaT', freq='M'), exp) tm.assert_index_equal(pd.Period('NaT', freq='M') - idx, exp) + def test_pi_comp_period(self): + idx = PeriodIndex(['2011-01', '2011-02', '2011-03', + '2011-04'], freq='M', name='idx') + + f = lambda x: x == pd.Period('2011-03', freq='M') + exp = np.array([False, False, True, False], dtype=np.bool) + self._check(idx, f, exp) + f = lambda x: pd.Period('2011-03', freq='M') == x + self._check(idx, f, exp) + + f = lambda x: x != pd.Period('2011-03', freq='M') + exp = np.array([True, True, False, True], dtype=np.bool) + self._check(idx, f, exp) + f = lambda x: pd.Period('2011-03', freq='M') != x + self._check(idx, f, exp) + + f = lambda x: pd.Period('2011-03', freq='M') >= x + exp = np.array([True, True, True, False], dtype=np.bool) + self._check(idx, f, exp) + + f = lambda x: x > pd.Period('2011-03', freq='M') + exp = np.array([False, False, False, True], dtype=np.bool) + self._check(idx, f, exp) + + f = lambda x: pd.Period('2011-03', freq='M') >= x + exp = np.array([True, True, True, False], dtype=np.bool) + self._check(idx, f, exp) + + def test_pi_comp_period_nat(self): + idx = PeriodIndex(['2011-01', 'NaT', '2011-03', + '2011-04'], freq='M', name='idx') + + f = lambda x: x == pd.Period('2011-03', freq='M') + exp = np.array([False, False, True, False], dtype=np.bool) + self._check(idx, f, exp) + f = lambda x: pd.Period('2011-03', freq='M') == x + self._check(idx, f, exp) + + f = lambda x: x == tslib.NaT + exp = np.array([False, 
False, False, False], dtype=np.bool) + self._check(idx, f, exp) + f = lambda x: tslib.NaT == x + self._check(idx, f, exp) + + f = lambda x: x != pd.Period('2011-03', freq='M') + exp = np.array([True, True, False, True], dtype=np.bool) + self._check(idx, f, exp) + f = lambda x: pd.Period('2011-03', freq='M') != x + self._check(idx, f, exp) + + f = lambda x: x != tslib.NaT + exp = np.array([True, True, True, True], dtype=np.bool) + self._check(idx, f, exp) + f = lambda x: tslib.NaT != x + self._check(idx, f, exp) + + f = lambda x: pd.Period('2011-03', freq='M') >= x + exp = np.array([True, False, True, False], dtype=np.bool) + self._check(idx, f, exp) + + f = lambda x: x < pd.Period('2011-03', freq='M') + exp = np.array([True, False, False, False], dtype=np.bool) + self._check(idx, f, exp) + + f = lambda x: x > tslib.NaT + exp = np.array([False, False, False, False], dtype=np.bool) + self._check(idx, f, exp) + + f = lambda x: tslib.NaT >= x + exp = np.array([False, False, False, False], dtype=np.bool) + self._check(idx, f, exp) + class TestPeriodRepresentation(tm.TestCase): """ @@ -3955,7 +4441,8 @@ def test_nanosecondly(self): def _check_freq(self, freq, base_date): rng = PeriodIndex(start=base_date, periods=10, freq=freq) exp = np.arange(10, dtype=np.int64) - self.assert_numpy_array_equal(rng.values, exp) + self.assert_numpy_array_equal(rng._values, exp) + self.assert_numpy_array_equal(rng.asi8, exp) def test_negone_ordinals(self): freqs = ['A', 'M', 'Q', 'D', 'H', 'T', 'S'] @@ -4222,6 +4709,36 @@ def test_constructor_cast_object(self): exp = Series(period_range('1/1/2000', periods=10)) tm.assert_series_equal(s, exp) + def test_isnull(self): + # GH 13737 + s = Series([pd.Period('2011-01', freq='M'), + pd.Period('NaT', freq='M')]) + tm.assert_series_equal(s.isnull(), Series([False, True])) + tm.assert_series_equal(s.notnull(), Series([True, False])) + + def test_fillna(self): + # GH 13737 + s = Series([pd.Period('2011-01', freq='M'), + pd.Period('NaT', freq='M')]) + + res = s.fillna(pd.Period('2012-01', freq='M')) + exp = Series([pd.Period('2011-01', freq='M'), + pd.Period('2012-01', freq='M')]) + tm.assert_series_equal(res, exp) + self.assertEqual(res.dtype, 'object') + + res = s.fillna('XXX') + exp = Series([pd.Period('2011-01', freq='M'), 'XXX']) + tm.assert_series_equal(res, exp) + self.assertEqual(res.dtype, 'object') + + def test_dropna(self): + # GH 13737 + s = Series([pd.Period('2011-01', freq='M'), + pd.Period('NaT', freq='M')]) + tm.assert_series_equal(s.dropna(), + Series([pd.Period('2011-01', freq='M')])) + def test_series_comparison_scalars(self): val = pd.Period('2000-01-04', freq='D') result = self.series > val @@ -4248,10 +4765,10 @@ def test_NaT_scalar(self): series = Series([0, 1000, 2000, iNaT], dtype='period[D]') val = series[3] - self.assertTrue(com.isnull(val)) + self.assertTrue(isnull(val)) series[2] = val - self.assertTrue(com.isnull(series[2])) + self.assertTrue(isnull(series[2])) def test_NaT_cast(self): result = Series([np.nan]).astype('period[D]') @@ -4458,6 +4975,7 @@ def test_get_period_field_array_raises_on_out_of_range(self): self.assertRaises(ValueError, _period.get_period_field_arr, -1, np.empty(1), 0) + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py index 2236d20975eee..204808dd510a0 100644 --- a/pandas/tseries/tests/test_resample.py +++ b/pandas/tseries/tests/test_resample.py @@ -11,10 +11,11 @@ import 
pandas.util.testing as tm from pandas import (Series, DataFrame, Panel, Index, isnull, notnull, Timestamp) + +from pandas.types.generic import ABCSeries, ABCDataFrame from pandas.compat import range, lrange, zip, product, OrderedDict from pandas.core.base import SpecificationError -from pandas.core.common import (ABCSeries, ABCDataFrame, - UnsupportedFunctionCall) +from pandas.core.common import UnsupportedFunctionCall from pandas.core.groupby import DataError from pandas.tseries.frequencies import MONTHS, DAYS from pandas.tseries.frequencies import to_offset @@ -167,6 +168,13 @@ def f(): check_stacklevel=False): self.assertIsInstance(getattr(r, op)(2), pd.Series) + # IPython introspection shouldn't trigger warning GH 13618 + for op in ['_repr_json', '_repr_latex', + '_ipython_canary_method_should_not_exist_']: + r = self.series.resample('H') + with tm.assert_produces_warning(None): + getattr(r, op, None) + # getitem compat df = self.series.to_frame('foo') @@ -363,18 +371,44 @@ def test_apply_without_aggregation(self): result = t.apply(lambda x: x) assert_series_equal(result, self.series) + def test_agg_consistency(self): + + # make sure that we are consistent across + # similar aggregations with and w/o selection list + df = DataFrame(np.random.randn(1000, 3), + index=pd.date_range('1/1/2012', freq='S', periods=1000), + columns=['A', 'B', 'C']) + + r = df.resample('3T') + + expected = r[['A', 'B', 'C']].agg({'r1': 'mean', 'r2': 'sum'}) + result = r.agg({'r1': 'mean', 'r2': 'sum'}) + assert_frame_equal(result, expected) + + # TODO: once GH 14008 is fixed, move these tests into + # `Base` test class def test_agg(self): - # test with both a Resampler and a TimeGrouper + # test with all three Resampler apis and TimeGrouper np.random.seed(1234) + index = date_range(datetime(2005, 1, 1), + datetime(2005, 1, 10), freq='D') + index.name = 'date' df = pd.DataFrame(np.random.rand(10, 2), columns=list('AB'), - index=pd.date_range('2010-01-01 09:00:00', - periods=10, - freq='s')) + index=index) + df_col = df.reset_index() + df_mult = df_col.copy() + df_mult.index = pd.MultiIndex.from_arrays([range(10), df.index], + names=['index', 'date']) + r = df.resample('2D') + cases = [ + r, + df_col.resample('2D', on='date'), + df_mult.resample('2D', level='date'), + df.groupby(pd.Grouper(freq='2D')) + ] - r = df.resample('2s') - g = df.groupby(pd.Grouper(freq='2s')) a_mean = r['A'].mean() a_std = r['A'].std() a_sum = r['A'].sum() @@ -385,12 +419,12 @@ def test_agg(self): expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1) expected.columns = pd.MultiIndex.from_product([['A', 'B'], ['mean', 'std']]) - for t in [r, g]: + for t in cases: result = t.aggregate([np.mean, np.std]) assert_frame_equal(result, expected) expected = pd.concat([a_mean, b_std], axis=1) - for t in [r, g]: + for t in cases: result = t.aggregate({'A': np.mean, 'B': np.std}) assert_frame_equal(result, expected, check_like=True) @@ -398,20 +432,20 @@ def test_agg(self): expected = pd.concat([a_mean, a_std], axis=1) expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'), ('A', 'std')]) - for t in [r, g]: + for t in cases: result = t.aggregate({'A': ['mean', 'std']}) assert_frame_equal(result, expected) expected = pd.concat([a_mean, a_sum], axis=1) expected.columns = ['mean', 'sum'] - for t in [r, g]: + for t in cases: result = t['A'].aggregate(['mean', 'sum']) assert_frame_equal(result, expected) expected = pd.concat([a_mean, a_sum], axis=1) expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'), ('A', 'sum')]) - for t in [r, 
g]: + for t in cases: result = t.aggregate({'A': {'mean': 'mean', 'sum': 'sum'}}) assert_frame_equal(result, expected, check_like=True) @@ -420,7 +454,7 @@ def test_agg(self): ('A', 'sum'), ('B', 'mean2'), ('B', 'sum2')]) - for t in [r, g]: + for t in cases: result = t.aggregate({'A': {'mean': 'mean', 'sum': 'sum'}, 'B': {'mean2': 'mean', 'sum2': 'sum'}}) assert_frame_equal(result, expected, check_like=True) @@ -430,7 +464,7 @@ def test_agg(self): ('A', 'std'), ('B', 'mean'), ('B', 'std')]) - for t in [r, g]: + for t in cases: result = t.aggregate({'A': ['mean', 'std'], 'B': ['mean', 'std']}) assert_frame_equal(result, expected, check_like=True) @@ -442,20 +476,30 @@ def test_agg(self): ('r2', 'B', 'sum')]) def test_agg_misc(self): - # test with both a Resampler and a TimeGrouper + # test with all three Resampler apis and TimeGrouper np.random.seed(1234) + index = date_range(datetime(2005, 1, 1), + datetime(2005, 1, 10), freq='D') + index.name = 'date' df = pd.DataFrame(np.random.rand(10, 2), columns=list('AB'), - index=pd.date_range('2010-01-01 09:00:00', - periods=10, - freq='s')) - - r = df.resample('2s') - g = df.groupby(pd.Grouper(freq='2s')) + index=index) + df_col = df.reset_index() + df_mult = df_col.copy() + df_mult.index = pd.MultiIndex.from_arrays([range(10), df.index], + names=['index', 'date']) + + r = df.resample('2D') + cases = [ + r, + df_col.resample('2D', on='date'), + df_mult.resample('2D', level='date'), + df.groupby(pd.Grouper(freq='2D')) + ] # passed lambda - for t in [r, g]: + for t in cases: result = t.agg({'A': np.sum, 'B': lambda x: np.std(x, ddof=1)}) rcustom = t['B'].apply(lambda x: np.std(x, ddof=1)) @@ -472,7 +516,7 @@ def test_agg_misc(self): ('result1', 'B'), ('result2', 'A'), ('result2', 'B')]) - for t in [r, g]: + for t in cases: result = t[['A', 'B']].agg(OrderedDict([('result1', np.sum), ('result2', np.mean)])) assert_frame_equal(result, expected, check_like=True) @@ -487,19 +531,19 @@ def test_agg_misc(self): ('A', 'std'), ('B', 'mean'), ('B', 'std')]) - for t in [r, g]: + for t in cases: result = t.agg(OrderedDict([('A', ['sum', 'std']), ('B', ['mean', 'std'])])) assert_frame_equal(result, expected, check_like=True) # equivalent of using a selection list / or not - for t in [r, g]: - result = g[['A', 'B']].agg({'A': ['sum', 'std'], + for t in cases: + result = t[['A', 'B']].agg({'A': ['sum', 'std'], 'B': ['mean', 'std']}) assert_frame_equal(result, expected, check_like=True) # series like aggs - for t in [r, g]: + for t in cases: result = t['A'].agg({'A': ['sum', 'std']}) expected = pd.concat([t['A'].sum(), t['A'].std()], @@ -520,9 +564,9 @@ def test_agg_misc(self): # errors # invalid names in the agg specification - for t in [r, g]: + for t in cases: def f(): - r[['A']].agg({'A': ['sum', 'std'], + t[['A']].agg({'A': ['sum', 'std'], 'B': ['mean', 'std']}) self.assertRaises(SpecificationError, f) @@ -530,22 +574,31 @@ def f(): def test_agg_nested_dicts(self): np.random.seed(1234) + index = date_range(datetime(2005, 1, 1), + datetime(2005, 1, 10), freq='D') + index.name = 'date' df = pd.DataFrame(np.random.rand(10, 2), columns=list('AB'), - index=pd.date_range('2010-01-01 09:00:00', - periods=10, - freq='s')) - - r = df.resample('2s') - g = df.groupby(pd.Grouper(freq='2s')) - - for t in [r, g]: + index=index) + df_col = df.reset_index() + df_mult = df_col.copy() + df_mult.index = pd.MultiIndex.from_arrays([range(10), df.index], + names=['index', 'date']) + r = df.resample('2D') + cases = [ + r, + df_col.resample('2D', on='date'), + df_mult.resample('2D', 
level='date'), + df.groupby(pd.Grouper(freq='2D')) + ] + + for t in cases: def f(): t.aggregate({'r1': {'A': ['mean', 'sum']}, 'r2': {'B': ['mean', 'sum']}}) self.assertRaises(ValueError, f) - for t in [r, g]: + for t in cases: expected = pd.concat([t['A'].mean(), t['A'].std(), t['B'].mean(), t['B'].std()], axis=1) expected.columns = pd.MultiIndex.from_tuples([('ra', 'mean'), ( @@ -559,19 +612,44 @@ def f(): 'B': {'rb': ['mean', 'std']}}) assert_frame_equal(result, expected, check_like=True) - def test_agg_consistency(self): + def test_selection_api_validation(self): + # GH 13500 + index = date_range(datetime(2005, 1, 1), + datetime(2005, 1, 10), freq='D') + df = pd.DataFrame({'date': index, + 'a': np.arange(len(index), dtype=np.int64)}, + index=pd.MultiIndex.from_arrays([ + np.arange(len(index), dtype=np.int64), + index], names=['v', 'd'])) + df_exp = pd.DataFrame({'a': np.arange(len(index), dtype=np.int64)}, + index=index) - # make sure that we are consistent across - # similar aggregations with and w/o selection list - df = DataFrame(np.random.randn(1000, 3), - index=pd.date_range('1/1/2012', freq='S', periods=1000), - columns=['A', 'B', 'C']) + # non DatetimeIndex + with tm.assertRaises(TypeError): + df.resample('2D', level='v') - r = df.resample('3T') + with tm.assertRaises(ValueError): + df.resample('2D', on='date', level='d') - expected = r[['A', 'B', 'C']].agg({'r1': 'mean', 'r2': 'sum'}) - result = r.agg({'r1': 'mean', 'r2': 'sum'}) - assert_frame_equal(result, expected) + with tm.assertRaises(TypeError): + df.resample('2D', on=['a', 'date']) + + with tm.assertRaises(KeyError): + df.resample('2D', level=['a', 'date']) + + # upsampling not allowed + with tm.assertRaises(ValueError): + df.resample('2D', level='d').asfreq() + + with tm.assertRaises(ValueError): + df.resample('2D', on='date').asfreq() + + exp = df_exp.resample('2D').sum() + exp.index.name = 'date' + assert_frame_equal(exp, df.resample('2D', on='date').sum()) + + exp.index.name = 'd' + assert_frame_equal(exp, df.resample('2D', level='d').sum()) class Base(object): @@ -1927,6 +2005,40 @@ def test_resample_with_nat(self): assert_frame_equal(frame.resample('60s').mean(), frame_3s) + def test_resample_timedelta_values(self): + # GH 13119 + # check that timedelta dtype is preserved when NaT values are + # introduced by the resampling + + times = timedelta_range('1 day', '4 day', freq='4D') + df = DataFrame({'time': times}, index=times) + + times2 = timedelta_range('1 day', '4 day', freq='2D') + exp = Series(times2, index=times2, name='time') + exp.iloc[1] = pd.NaT + + res = df.resample('2D').first()['time'] + tm.assert_series_equal(res, exp) + res = df['time'].resample('2D').first() + tm.assert_series_equal(res, exp) + + def test_resample_datetime_values(self): + # GH 13119 + # check that datetime dtype is preserved when NaT values are + # introduced by the resampling + + dates = [datetime(2016, 1, 15), datetime(2016, 1, 19)] + df = DataFrame({'timestamp': dates}, index=dates) + + exp = Series([datetime(2016, 1, 15), pd.NaT, datetime(2016, 1, 19)], + index=date_range('2016-01-15', periods=3, freq='2D'), + name='timestamp') + + res = df.resample('2D').first()['timestamp'] + tm.assert_series_equal(res, exp) + res = df['timestamp'].resample('2D').first() + tm.assert_series_equal(res, exp) + class TestPeriodIndex(Base, tm.TestCase): _multiprocess_can_split_ = True @@ -1984,6 +2096,22 @@ def test_asfreq_upsample(self): result = frame.resample('1H').asfreq() assert_frame_equal(result, expected) + def test_selection(self): + index 
= self.create_series().index + # This is a bug, these should be implemented + # GH 14008 + df = pd.DataFrame({'date': index, + 'a': np.arange(len(index), dtype=np.int64)}, + index=pd.MultiIndex.from_arrays([ + np.arange(len(index), dtype=np.int64), + index], names=['v', 'd'])) + + with tm.assertRaises(NotImplementedError): + df.resample('2D', on='date') + + with tm.assertRaises(NotImplementedError): + df.resample('2D', level='d') + def test_annual_upsample_D_s_f(self): self._check_annual_upsample_cases('D', 'start', 'ffill') diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py index 10276137b42a1..ab413af897215 100644 --- a/pandas/tseries/tests/test_timedeltas.py +++ b/pandas/tseries/tests/test_timedeltas.py @@ -30,6 +30,25 @@ class TestTimedeltas(tm.TestCase): def setUp(self): pass + def test_get_loc_nat(self): + tidx = TimedeltaIndex(['1 days 01:00:00', 'NaT', '2 days 01:00:00']) + + self.assertEqual(tidx.get_loc(pd.NaT), 1) + self.assertEqual(tidx.get_loc(None), 1) + self.assertEqual(tidx.get_loc(float('nan')), 1) + self.assertEqual(tidx.get_loc(np.nan), 1) + + def test_contains(self): + # Checking for any NaT-like objects + # GH 13603 + td = to_timedelta(range(5), unit='d') + pd.offsets.Hour(1) + for v in [pd.NaT, None, float('nan'), np.nan]: + self.assertFalse((v in td)) + + td = to_timedelta([pd.NaT]) + for v in [pd.NaT, None, float('nan'), np.nan]: + self.assertTrue((v in td)) + def test_construction(self): expected = np.timedelta64(10, 'D').astype('m8[ns]').view('i8') @@ -137,12 +156,12 @@ def test_construction(self): self.assertRaises(ValueError, lambda: Timedelta('3.1415')) # invalid construction - tm.assertRaisesRegexp(ValueError, "cannot construct a TimeDelta", + tm.assertRaisesRegexp(ValueError, "cannot construct a Timedelta", lambda: Timedelta()) tm.assertRaisesRegexp(ValueError, "unit abbreviation w/o a number", lambda: Timedelta('foo')) tm.assertRaisesRegexp(ValueError, - "cannot construct a TimeDelta from the passed " + "cannot construct a Timedelta from the passed " "arguments, allowed keywords are ", lambda: Timedelta(day=10)) @@ -169,7 +188,8 @@ def test_construction(self): self.assertEqual(Timedelta('').value, iNaT) self.assertEqual(Timedelta('nat').value, iNaT) self.assertEqual(Timedelta('NAT').value, iNaT) - self.assertTrue(isnull(Timestamp('nat'))) + self.assertEqual(Timedelta(None).value, iNaT) + self.assertEqual(Timedelta(np.nan).value, iNaT) self.assertTrue(isnull(Timedelta('nat'))) # offset @@ -472,6 +492,21 @@ class Other: self.assertTrue(td.__mul__(other) is NotImplemented) self.assertTrue(td.__floordiv__(td) is NotImplemented) + def test_ops_error_str(self): + # GH 13624 + td = Timedelta('1 day') + + for l, r in [(td, 'a'), ('a', td)]: + + with tm.assertRaises(TypeError): + l + r + + with tm.assertRaises(TypeError): + l > r + + self.assertFalse(l == r) + self.assertTrue(l != r) + def test_fields(self): def check(value): # that we are int/long like @@ -810,6 +845,11 @@ def testit(unit, transform): def test_to_timedelta_invalid(self): + # bad value for errors parameter + msg = "errors must be one of" + tm.assertRaisesRegexp(ValueError, msg, to_timedelta, + ['foo'], errors='never') + # these will error self.assertRaises(ValueError, lambda: to_timedelta([1, 2], unit='foo')) self.assertRaises(ValueError, lambda: to_timedelta(1, unit='foo')) @@ -827,6 +867,24 @@ def test_to_timedelta_invalid(self): to_timedelta(['1 day', 'bar', '1 min'], errors='coerce')) + # gh-13613: these should not error because errors='ignore' + 
invalid_data = 'apple' + self.assertEqual(invalid_data, to_timedelta( + invalid_data, errors='ignore')) + + invalid_data = ['apple', '1 days'] + tm.assert_numpy_array_equal( + np.array(invalid_data, dtype=object), + to_timedelta(invalid_data, errors='ignore')) + + invalid_data = pd.Index(['apple', '1 days']) + tm.assert_index_equal(invalid_data, to_timedelta( + invalid_data, errors='ignore')) + + invalid_data = Series(['apple', '1 days']) + tm.assert_series_equal(invalid_data, to_timedelta( + invalid_data, errors='ignore')) + def test_to_timedelta_via_apply(self): # GH 5458 expected = Series([np.timedelta64(1, 's')]) @@ -1432,6 +1490,23 @@ def test_comparisons_nat(self): expected = np.array([True, True, True, True, True, False]) self.assert_numpy_array_equal(result, expected) + def test_ops_error_str(self): + # GH 13624 + tdi = TimedeltaIndex(['1 day', '2 days']) + + for l, r in [(tdi, 'a'), ('a', tdi)]: + with tm.assertRaises(TypeError): + l + r + + with tm.assertRaises(TypeError): + l > r + + with tm.assertRaises(TypeError): + l == r + + with tm.assertRaises(TypeError): + l != r + def test_map(self): rng = timedelta_range('1 day', periods=10) @@ -1448,7 +1523,7 @@ def test_misc_coverage(self): tm.assertIsInstance(list(result.values())[0][0], Timedelta) idx = TimedeltaIndex(['3d', '1d', '2d']) - self.assertTrue(idx.equals(list(idx))) + self.assertFalse(idx.equals(list(idx))) non_td = Index(list('abc')) self.assertFalse(idx.equals(list(non_td))) @@ -1547,12 +1622,14 @@ def test_sort_values(self): ordered, dexer = idx.sort_values(return_indexer=True) self.assertTrue(ordered.is_monotonic) self.assert_numpy_array_equal(dexer, - np.array([1, 2, 0], dtype=np.int64)) + np.array([1, 2, 0]), + check_dtype=False) ordered, dexer = idx.sort_values(return_indexer=True, ascending=False) self.assertTrue(ordered[::-1].is_monotonic) self.assert_numpy_array_equal(dexer, - np.array([0, 2, 1], dtype=np.int64)) + np.array([0, 2, 1]), + check_dtype=False) def test_insert(self): @@ -1739,13 +1816,13 @@ def test_join_self(self): kinds = 'outer', 'inner', 'left', 'right' for kind in kinds: joined = index.join(index, how=kind) - self.assertIs(index, joined) + tm.assert_index_equal(index, joined) def test_factorize(self): idx1 = TimedeltaIndex(['1 day', '1 day', '2 day', '2 day', '3 day', '3 day']) - exp_arr = np.array([0, 0, 1, 1, 2, 2]) + exp_arr = np.array([0, 0, 1, 1, 2, 2], dtype=np.intp) exp_idx = TimedeltaIndex(['1 day', '2 day', '3 day']) arr, idx = idx1.factorize() @@ -1758,7 +1835,7 @@ def test_factorize(self): # freq must be preserved idx3 = timedelta_range('1 day', periods=4, freq='s') - exp_arr = np.array([0, 1, 2, 3]) + exp_arr = np.array([0, 1, 2, 3], dtype=np.intp) arr, idx = idx3.factorize() self.assert_numpy_array_equal(arr, exp_arr) self.assert_index_equal(idx, idx3) diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index f6d80f7ee410b..ac48fcc2551ea 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -12,10 +12,10 @@ import pandas.lib as lib import pandas.tslib as tslib +from pandas.types.common import is_datetime64_ns_dtype import pandas as pd import pandas.compat as compat import pandas.core.common as com -import pandas.core.datetools as datetools import pandas.tseries.frequencies as frequencies import pandas.tseries.offsets as offsets import pandas.tseries.tools as tools @@ -565,7 +565,7 @@ def test_frame_fillna_limit(self): def test_frame_setitem_timestamp(self): # 2155 columns = 
DatetimeIndex(start='1/1/2012', end='2/1/2012', - freq=datetools.bday) + freq=offsets.BDay()) index = lrange(10) data = DataFrame(columns=columns, index=index) t = datetime(2012, 11, 1) @@ -744,6 +744,14 @@ def test_to_datetime_unit(self): seconds=t) for t in range(20)] + [NaT]) assert_series_equal(result, expected) + # GH13834 + s = Series([epoch + t for t in np.arange(0, 2, .25)] + + [iNaT]).astype(float) + result = to_datetime(s, unit='s') + expected = Series([Timestamp('2013-06-09 02:42:28') + timedelta( + seconds=t) for t in np.arange(0, 2, .25)] + [NaT]) + assert_series_equal(result, expected) + s = concat([Series([epoch + t for t in range(20)] ).astype(float), Series([np.nan])], ignore_index=True) @@ -968,13 +976,20 @@ def test_nat_vector_field_access(self): fields = ['year', 'quarter', 'month', 'day', 'hour', 'minute', 'second', 'microsecond', 'nanosecond', 'week', 'dayofyear', - 'days_in_month'] + 'days_in_month', 'is_leap_year'] + for field in fields: result = getattr(idx, field) - expected = [getattr(x, field) if x is not NaT else np.nan - for x in idx] + expected = [getattr(x, field) for x in idx] self.assert_numpy_array_equal(result, np.array(expected)) + s = pd.Series(idx) + + for field in fields: + result = getattr(s.dt, field) + expected = [getattr(x, field) for x in idx] + self.assert_series_equal(result, pd.Series(expected)) + def test_nat_scalar_field_access(self): fields = ['year', 'quarter', 'month', 'day', 'hour', 'minute', 'second', 'microsecond', 'nanosecond', 'week', 'dayofyear', @@ -1004,7 +1019,13 @@ def test_NaT_methods(self): for method in nat_methods: if hasattr(NaT, method): - self.assertIs(getattr(NaT, method)(), NaT) + # see gh-8254 + exp_warning = None + if method == 'to_datetime': + exp_warning = FutureWarning + with tm.assert_produces_warning( + exp_warning, check_stacklevel=False): + self.assertIs(getattr(NaT, method)(), NaT) # GH 12300 self.assertEqual(NaT.isoformat(), 'NaT') @@ -1074,6 +1095,71 @@ def test_to_datetime_freq(self): self.assertEqual(xp.freq, rs.freq) self.assertEqual(xp.tzinfo, rs.tzinfo) + def test_range_edges(self): + # GH 13672 + idx = DatetimeIndex(start=Timestamp('1970-01-01 00:00:00.000000001'), + end=Timestamp('1970-01-01 00:00:00.000000004'), + freq='N') + exp = DatetimeIndex(['1970-01-01 00:00:00.000000001', + '1970-01-01 00:00:00.000000002', + '1970-01-01 00:00:00.000000003', + '1970-01-01 00:00:00.000000004']) + tm.assert_index_equal(idx, exp) + + idx = DatetimeIndex(start=Timestamp('1970-01-01 00:00:00.000000004'), + end=Timestamp('1970-01-01 00:00:00.000000001'), + freq='N') + exp = DatetimeIndex([]) + tm.assert_index_equal(idx, exp) + + idx = DatetimeIndex(start=Timestamp('1970-01-01 00:00:00.000000001'), + end=Timestamp('1970-01-01 00:00:00.000000001'), + freq='N') + exp = DatetimeIndex(['1970-01-01 00:00:00.000000001']) + tm.assert_index_equal(idx, exp) + + idx = DatetimeIndex(start=Timestamp('1970-01-01 00:00:00.000001'), + end=Timestamp('1970-01-01 00:00:00.000004'), + freq='U') + exp = DatetimeIndex(['1970-01-01 00:00:00.000001', + '1970-01-01 00:00:00.000002', + '1970-01-01 00:00:00.000003', + '1970-01-01 00:00:00.000004']) + tm.assert_index_equal(idx, exp) + + idx = DatetimeIndex(start=Timestamp('1970-01-01 00:00:00.001'), + end=Timestamp('1970-01-01 00:00:00.004'), + freq='L') + exp = DatetimeIndex(['1970-01-01 00:00:00.001', + '1970-01-01 00:00:00.002', + '1970-01-01 00:00:00.003', + '1970-01-01 00:00:00.004']) + tm.assert_index_equal(idx, exp) + + idx = DatetimeIndex(start=Timestamp('1970-01-01 00:00:01'), + 
end=Timestamp('1970-01-01 00:00:04'), freq='S') + exp = DatetimeIndex(['1970-01-01 00:00:01', '1970-01-01 00:00:02', + '1970-01-01 00:00:03', '1970-01-01 00:00:04']) + tm.assert_index_equal(idx, exp) + + idx = DatetimeIndex(start=Timestamp('1970-01-01 00:01'), + end=Timestamp('1970-01-01 00:04'), freq='T') + exp = DatetimeIndex(['1970-01-01 00:01', '1970-01-01 00:02', + '1970-01-01 00:03', '1970-01-01 00:04']) + tm.assert_index_equal(idx, exp) + + idx = DatetimeIndex(start=Timestamp('1970-01-01 01:00'), + end=Timestamp('1970-01-01 04:00'), freq='H') + exp = DatetimeIndex(['1970-01-01 01:00', '1970-01-01 02:00', + '1970-01-01 03:00', '1970-01-01 04:00']) + tm.assert_index_equal(idx, exp) + + idx = DatetimeIndex(start=Timestamp('1970-01-01'), + end=Timestamp('1970-01-04'), freq='D') + exp = DatetimeIndex(['1970-01-01', '1970-01-02', + '1970-01-03', '1970-01-04']) + tm.assert_index_equal(idx, exp) + def test_range_misspecified(self): # GH #1095 @@ -1261,13 +1347,6 @@ def test_format_pre_1900_dates(self): ts = Series(1, index=rng) repr(ts) - def test_repeat(self): - rng = date_range('1/1/2000', '1/1/2001') - - result = rng.repeat(5) - self.assertIsNone(result.freq) - self.assertEqual(len(result), 5 * len(rng)) - def test_at_time(self): rng = date_range('1/1/2000', '1/5/2000', freq='5min') ts = Series(np.random.randn(len(rng)), index=rng) @@ -1519,6 +1598,27 @@ def test_dti_constructor_years_only(self): (rng3, expected3), (rng4, expected4)]: tm.assert_index_equal(rng, expected) + def test_dti_constructor_small_int(self): + # GH 13721 + exp = DatetimeIndex(['1970-01-01 00:00:00.00000000', + '1970-01-01 00:00:00.00000001', + '1970-01-01 00:00:00.00000002']) + + for dtype in [np.int64, np.int32, np.int16, np.int8]: + arr = np.array([0, 10, 20], dtype=dtype) + tm.assert_index_equal(DatetimeIndex(arr), exp) + + def test_dti_constructor_numpy_timeunits(self): + # GH 9114 + base = pd.to_datetime(['2000-01-01T00:00', '2000-01-02T00:00', 'NaT']) + + for dtype in ['datetime64[h]', 'datetime64[m]', 'datetime64[s]', + 'datetime64[ms]', 'datetime64[us]', 'datetime64[ns]']: + values = base.values.astype(dtype) + + tm.assert_index_equal(DatetimeIndex(values), base) + tm.assert_index_equal(to_datetime(values), base) + def test_normalize(self): rng = date_range('1/1/2000 9:30', periods=10, freq='D') @@ -1817,7 +1917,7 @@ def test_astype_object(self): self.assertEqual(casted.tolist(), exp_values) def test_catch_infinite_loop(self): - offset = datetools.DateOffset(minute=5) + offset = offsets.DateOffset(minute=5) # blow up, don't loop forever self.assertRaises(Exception, date_range, datetime(2011, 11, 11), datetime(2011, 11, 12), freq=offset) @@ -2282,7 +2382,7 @@ def test_to_datetime_tz_psycopg2(self): i = pd.DatetimeIndex([ '2000-01-01 08:00:00+00:00' ], tz=psycopg2.tz.FixedOffsetTimezone(offset=-300, name=None)) - self.assertFalse(com.is_datetime64_ns_dtype(i)) + self.assertFalse(is_datetime64_ns_dtype(i)) # tz coerceion result = pd.to_datetime(i, errors='coerce') @@ -2440,15 +2540,19 @@ def test_unit_mixed(self): def test_index_to_datetime(self): idx = Index(['1/1/2000', '1/2/2000', '1/3/2000']) - result = idx.to_datetime() - expected = DatetimeIndex(datetools.to_datetime(idx.values)) - tm.assert_index_equal(result, expected) + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = idx.to_datetime() + expected = DatetimeIndex(pd.to_datetime(idx.values)) + tm.assert_index_equal(result, expected) - today = datetime.today() - idx = Index([today], dtype=object) - result = 
idx.to_datetime() - expected = DatetimeIndex([today]) - tm.assert_index_equal(result, expected) + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + today = datetime.today() + idx = Index([today], dtype=object) + result = idx.to_datetime() + expected = DatetimeIndex([today]) + tm.assert_index_equal(result, expected) def test_dataframe(self): @@ -2563,6 +2667,33 @@ def test_dataframe(self): with self.assertRaises(ValueError): to_datetime(df2) + def test_dataframe_dtypes(self): + # #13451 + df = DataFrame({'year': [2015, 2016], + 'month': [2, 3], + 'day': [4, 5]}) + + # int16 + result = to_datetime(df.astype('int16')) + expected = Series([Timestamp('20150204 00:00:00'), + Timestamp('20160305 00:00:00')]) + assert_series_equal(result, expected) + + # mixed dtypes + df['month'] = df['month'].astype('int8') + df['day'] = df['day'].astype('int8') + result = to_datetime(df) + expected = Series([Timestamp('20150204 00:00:00'), + Timestamp('20160305 00:00:00')]) + assert_series_equal(result, expected) + + # float + df = DataFrame({'year': [2000, 2001], + 'month': [1.5, 1], + 'day': [1, 1]}) + with self.assertRaises(ValueError): + to_datetime(df) + class TestDatetimeIndex(tm.TestCase): _multiprocess_can_split_ = True @@ -2900,7 +3031,7 @@ def test_misc_coverage(self): tm.assertIsInstance(list(result.values())[0][0], Timestamp) idx = DatetimeIndex(['2000-01-03', '2000-01-01', '2000-01-02']) - self.assertTrue(idx.equals(list(idx))) + self.assertFalse(idx.equals(list(idx))) non_datetime = Index(list('abc')) self.assertFalse(idx.equals(list(non_datetime))) @@ -3095,10 +3226,14 @@ def test_datetime64_with_DateOffset(self): exp = klass([Timestamp('2001-1-1'), Timestamp('2001-2-1')]) assert_func(result, exp) - s = klass([Timestamp('2000-01-05 00:15:00'), Timestamp( - '2000-01-31 00:23:00'), Timestamp('2000-01-01'), Timestamp( - '2000-03-31'), Timestamp('2000-02-29'), Timestamp( - '2000-12-31')]) + s = klass([Timestamp('2000-01-05 00:15:00'), + Timestamp('2000-01-31 00:23:00'), + Timestamp('2000-01-01'), + Timestamp('2000-03-31'), + Timestamp('2000-02-29'), + Timestamp('2000-12-31'), + Timestamp('2000-05-15'), + Timestamp('2001-06-15')]) # DateOffset relativedelta fastpath relative_kwargs = [('years', 2), ('months', 5), ('days', 3), @@ -3115,6 +3250,7 @@ def test_datetime64_with_DateOffset(self): # assert these are equal on a piecewise basis offsets = ['YearBegin', ('YearBegin', {'month': 5}), 'YearEnd', ('YearEnd', {'month': 5}), 'MonthBegin', 'MonthEnd', + 'SemiMonthEnd', 'SemiMonthBegin', 'Week', ('Week', { 'weekday': 3 }), 'BusinessDay', 'BDay', 'QuarterEnd', 'QuarterBegin', @@ -3642,7 +3778,7 @@ def test_ns_index(self): dtstart = np.datetime64('2012-09-20T00:00:00') dt = dtstart + np.arange(nsamples) * np.timedelta64(ns, 'ns') - freq = ns * pd.datetools.Nano() + freq = ns * offsets.Nano() index = pd.DatetimeIndex(dt, freq=freq, name='time') self.assert_index_parameters(index) @@ -3666,7 +3802,7 @@ def test_factorize(self): idx1 = DatetimeIndex(['2014-01', '2014-01', '2014-02', '2014-02', '2014-03', '2014-03']) - exp_arr = np.array([0, 0, 1, 1, 2, 2]) + exp_arr = np.array([0, 0, 1, 1, 2, 2], dtype=np.intp) exp_idx = DatetimeIndex(['2014-01', '2014-02', '2014-03']) arr, idx = idx1.factorize() @@ -3688,13 +3824,13 @@ def test_factorize(self): idx2 = pd.DatetimeIndex(['2014-03', '2014-03', '2014-02', '2014-01', '2014-03', '2014-01']) - exp_arr = np.array([2, 2, 1, 0, 2, 0]) + exp_arr = np.array([2, 2, 1, 0, 2, 0], dtype=np.intp) exp_idx = DatetimeIndex(['2014-01', '2014-02', 
'2014-03']) arr, idx = idx2.factorize(sort=True) self.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) - exp_arr = np.array([0, 0, 1, 2, 0, 2]) + exp_arr = np.array([0, 0, 1, 2, 0, 2], dtype=np.intp) exp_idx = DatetimeIndex(['2014-03', '2014-02', '2014-01']) arr, idx = idx2.factorize() self.assert_numpy_array_equal(arr, exp_arr) @@ -3702,11 +3838,42 @@ def test_factorize(self): # freq must be preserved idx3 = date_range('2000-01', periods=4, freq='M', tz='Asia/Tokyo') - exp_arr = np.array([0, 1, 2, 3]) + exp_arr = np.array([0, 1, 2, 3], dtype=np.intp) arr, idx = idx3.factorize() self.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, idx3) + def test_factorize_tz(self): + # GH 13750 + for tz in [None, 'UTC', 'US/Eastern', 'Asia/Tokyo']: + base = pd.date_range('2016-11-05', freq='H', periods=100, tz=tz) + idx = base.repeat(5) + + exp_arr = np.arange(100, dtype=np.intp).repeat(5) + + for obj in [idx, pd.Series(idx)]: + arr, res = obj.factorize() + self.assert_numpy_array_equal(arr, exp_arr) + tm.assert_index_equal(res, base) + + def test_factorize_dst(self): + # GH 13750 + idx = pd.date_range('2016-11-06', freq='H', periods=12, + tz='US/Eastern') + + for obj in [idx, pd.Series(idx)]: + arr, res = obj.factorize() + self.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp)) + tm.assert_index_equal(res, idx) + + idx = pd.date_range('2016-06-13', freq='H', periods=12, + tz='US/Eastern') + + for obj in [idx, pd.Series(idx)]: + arr, res = obj.factorize() + self.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp)) + tm.assert_index_equal(res, idx) + def test_slice_with_negative_step(self): ts = Series(np.arange(20), date_range('2014-01-01', periods=20, freq='MS')) @@ -3852,36 +4019,36 @@ def test_datetimeindex_accessors(self): self.assertEqual(dti.is_month_start[0], 1) tests = [ - (Timestamp('2013-06-01', offset='M').is_month_start, 1), - (Timestamp('2013-06-01', offset='BM').is_month_start, 0), - (Timestamp('2013-06-03', offset='M').is_month_start, 0), - (Timestamp('2013-06-03', offset='BM').is_month_start, 1), - (Timestamp('2013-02-28', offset='Q-FEB').is_month_end, 1), - (Timestamp('2013-02-28', offset='Q-FEB').is_quarter_end, 1), - (Timestamp('2013-02-28', offset='Q-FEB').is_year_end, 1), - (Timestamp('2013-03-01', offset='Q-FEB').is_month_start, 1), - (Timestamp('2013-03-01', offset='Q-FEB').is_quarter_start, 1), - (Timestamp('2013-03-01', offset='Q-FEB').is_year_start, 1), - (Timestamp('2013-03-31', offset='QS-FEB').is_month_end, 1), - (Timestamp('2013-03-31', offset='QS-FEB').is_quarter_end, 0), - (Timestamp('2013-03-31', offset='QS-FEB').is_year_end, 0), - (Timestamp('2013-02-01', offset='QS-FEB').is_month_start, 1), - (Timestamp('2013-02-01', offset='QS-FEB').is_quarter_start, 1), - (Timestamp('2013-02-01', offset='QS-FEB').is_year_start, 1), - (Timestamp('2013-06-30', offset='BQ').is_month_end, 0), - (Timestamp('2013-06-30', offset='BQ').is_quarter_end, 0), - (Timestamp('2013-06-30', offset='BQ').is_year_end, 0), - (Timestamp('2013-06-28', offset='BQ').is_month_end, 1), - (Timestamp('2013-06-28', offset='BQ').is_quarter_end, 1), - (Timestamp('2013-06-28', offset='BQ').is_year_end, 0), - (Timestamp('2013-06-30', offset='BQS-APR').is_month_end, 0), - (Timestamp('2013-06-30', offset='BQS-APR').is_quarter_end, 0), - (Timestamp('2013-06-30', offset='BQS-APR').is_year_end, 0), - (Timestamp('2013-06-28', offset='BQS-APR').is_month_end, 1), - (Timestamp('2013-06-28', offset='BQS-APR').is_quarter_end, 1), - (Timestamp('2013-03-29', 
offset='BQS-APR').is_year_end, 1), - (Timestamp('2013-11-01', offset='AS-NOV').is_year_start, 1), - (Timestamp('2013-10-31', offset='AS-NOV').is_year_end, 1), + (Timestamp('2013-06-01', freq='M').is_month_start, 1), + (Timestamp('2013-06-01', freq='BM').is_month_start, 0), + (Timestamp('2013-06-03', freq='M').is_month_start, 0), + (Timestamp('2013-06-03', freq='BM').is_month_start, 1), + (Timestamp('2013-02-28', freq='Q-FEB').is_month_end, 1), + (Timestamp('2013-02-28', freq='Q-FEB').is_quarter_end, 1), + (Timestamp('2013-02-28', freq='Q-FEB').is_year_end, 1), + (Timestamp('2013-03-01', freq='Q-FEB').is_month_start, 1), + (Timestamp('2013-03-01', freq='Q-FEB').is_quarter_start, 1), + (Timestamp('2013-03-01', freq='Q-FEB').is_year_start, 1), + (Timestamp('2013-03-31', freq='QS-FEB').is_month_end, 1), + (Timestamp('2013-03-31', freq='QS-FEB').is_quarter_end, 0), + (Timestamp('2013-03-31', freq='QS-FEB').is_year_end, 0), + (Timestamp('2013-02-01', freq='QS-FEB').is_month_start, 1), + (Timestamp('2013-02-01', freq='QS-FEB').is_quarter_start, 1), + (Timestamp('2013-02-01', freq='QS-FEB').is_year_start, 1), + (Timestamp('2013-06-30', freq='BQ').is_month_end, 0), + (Timestamp('2013-06-30', freq='BQ').is_quarter_end, 0), + (Timestamp('2013-06-30', freq='BQ').is_year_end, 0), + (Timestamp('2013-06-28', freq='BQ').is_month_end, 1), + (Timestamp('2013-06-28', freq='BQ').is_quarter_end, 1), + (Timestamp('2013-06-28', freq='BQ').is_year_end, 0), + (Timestamp('2013-06-30', freq='BQS-APR').is_month_end, 0), + (Timestamp('2013-06-30', freq='BQS-APR').is_quarter_end, 0), + (Timestamp('2013-06-30', freq='BQS-APR').is_year_end, 0), + (Timestamp('2013-06-28', freq='BQS-APR').is_month_end, 1), + (Timestamp('2013-06-28', freq='BQS-APR').is_quarter_end, 1), + (Timestamp('2013-03-29', freq='BQS-APR').is_year_end, 1), + (Timestamp('2013-11-01', freq='AS-NOV').is_year_start, 1), + (Timestamp('2013-10-31', freq='AS-NOV').is_year_end, 1), (Timestamp('2012-02-01').days_in_month, 29), (Timestamp('2013-02-01').days_in_month, 28)] @@ -3966,7 +4133,7 @@ def test_datetimeindex_constructor(self): edate = datetime(2000, 1, 1) idx = DatetimeIndex(start=sdate, freq='1B', periods=20) self.assertEqual(len(idx), 20) - self.assertEqual(idx[0], sdate + 0 * datetools.bday) + self.assertEqual(idx[0], sdate + 0 * offsets.BDay()) self.assertEqual(idx.freq, 'B') idx = DatetimeIndex(end=edate, freq=('D', 5), periods=20) @@ -3976,19 +4143,19 @@ def test_datetimeindex_constructor(self): idx1 = DatetimeIndex(start=sdate, end=edate, freq='W-SUN') idx2 = DatetimeIndex(start=sdate, end=edate, - freq=datetools.Week(weekday=6)) + freq=offsets.Week(weekday=6)) self.assertEqual(len(idx1), len(idx2)) self.assertEqual(idx1.offset, idx2.offset) idx1 = DatetimeIndex(start=sdate, end=edate, freq='QS') idx2 = DatetimeIndex(start=sdate, end=edate, - freq=datetools.QuarterBegin(startingMonth=1)) + freq=offsets.QuarterBegin(startingMonth=1)) self.assertEqual(len(idx1), len(idx2)) self.assertEqual(idx1.offset, idx2.offset) idx1 = DatetimeIndex(start=sdate, end=edate, freq='BQ') idx2 = DatetimeIndex(start=sdate, end=edate, - freq=datetools.BQuarterEnd(startingMonth=12)) + freq=offsets.BQuarterEnd(startingMonth=12)) self.assertEqual(len(idx1), len(idx2)) self.assertEqual(idx1.offset, idx2.offset) @@ -4054,8 +4221,9 @@ def test_dti_set_index_reindex(self): # 11314 # with tz - index = date_range(datetime(2015, 10, 1), datetime( - 2015, 10, 1, 23), freq='H', tz='US/Eastern') + index = date_range(datetime(2015, 10, 1), + datetime(2015, 10, 1, 23), + freq='H', 
tz='US/Eastern') df = DataFrame(np.random.randn(24, 1), columns=['a'], index=index) new_index = date_range(datetime(2015, 10, 2), datetime(2015, 10, 2, 23), @@ -4231,6 +4399,14 @@ def test_intercept_astype_object(self): result = df.values.squeeze() self.assertTrue((result[:, 0] == expected.values).all()) + def test_nat_operations(self): + # GH 8617 + s = Series([0, pd.NaT], dtype='m8[ns]') + exp = s[0] + self.assertEqual(s.median(), exp) + self.assertEqual(s.min(), exp) + self.assertEqual(s.max(), exp) + class TestTimestamp(tm.TestCase): def test_class_ops_pytz(self): @@ -4355,6 +4531,8 @@ def check(val, unit=None, h=1, s=1, us=0): result = Timestamp('NaT') self.assertIs(result, NaT) + self.assertTrue(isnull(Timestamp('nat'))) + def test_roundtrip(self): # test value to string and back conversions @@ -4540,7 +4718,8 @@ def test_frequency_misc(self): self.assertRaises(ValueError, frequencies.to_offset, ('', '')) - result = frequencies.get_standard_freq(offsets.Hour()) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = frequencies.get_standard_freq(offsets.Hour()) self.assertEqual(result, 'H') def test_hash_equivalent(self): @@ -4608,6 +4787,25 @@ def test_timestamp_compare_series(self): result = right_f(Timestamp('nat'), s_nat) tm.assert_series_equal(result, expected) + def test_is_leap_year(self): + # GH 13727 + for tz in [None, 'UTC', 'US/Eastern', 'Asia/Tokyo']: + dt = Timestamp('2000-01-01 00:00:00', tz=tz) + self.assertTrue(dt.is_leap_year) + self.assertIsInstance(dt.is_leap_year, bool) + + dt = Timestamp('1999-01-01 00:00:00', tz=tz) + self.assertFalse(dt.is_leap_year) + + dt = Timestamp('2004-01-01 00:00:00', tz=tz) + self.assertTrue(dt.is_leap_year) + + dt = Timestamp('2100-01-01 00:00:00', tz=tz) + self.assertFalse(dt.is_leap_year) + + self.assertFalse(pd.NaT.is_leap_year) + self.assertIsInstance(pd.NaT.is_leap_year, bool) + class TestSlicing(tm.TestCase): def test_slice_year(self): @@ -4820,7 +5018,7 @@ def test_shift(self): # GH #1063, multiple of same base result = ts.shift(1, freq='4H') - exp_index = ts.index + datetools.Hour(4) + exp_index = ts.index + offsets.Hour(4) tm.assert_index_equal(result.index, exp_index) idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-04']) diff --git a/pandas/tseries/tests/test_timeseries_legacy.py b/pandas/tseries/tests/test_timeseries_legacy.py index 6f58ad3a57b48..d8c01c53fb2e5 100644 --- a/pandas/tseries/tests/test_timeseries_legacy.py +++ b/pandas/tseries/tests/test_timeseries_legacy.py @@ -8,8 +8,8 @@ from pandas import (Index, Series, date_range, Timestamp, DatetimeIndex, Int64Index, to_datetime) -import pandas.core.datetools as datetools -import pandas.tseries.offsets as offsets +from pandas.tseries.frequencies import get_offset, to_offset +from pandas.tseries.offsets import BDay, Micro, Milli, MonthBegin import pandas as pd from pandas.util.testing import assert_series_equal, assert_almost_equal @@ -19,12 +19,11 @@ from pandas import read_pickle from numpy.random import rand import pandas.compat as compat -from pandas.core.datetools import BDay randn = np.random.randn -# infortunately, too much has changed to handle these legacy pickles +# Unfortunately, too much has changed to handle these legacy pickles # class TestLegacySupport(unittest.TestCase): class LegacySupport(object): @@ -65,8 +64,6 @@ def test_unpickle_legacy_frame(self): self.assertEqual(unpickled.index.offset, BDay(1, normalize=True)) def test_unpickle_legacy_series(self): - from pandas.core.datetools import BDay - unpickled = 
self.series dtindex = DatetimeIndex(start='1/3/2005', end='1/14/2005', @@ -86,7 +83,7 @@ def test_unpickle_legacy_len0_daterange(self): ex_index = DatetimeIndex([], freq='B') self.assert_index_equal(result.index, ex_index) - tm.assertIsInstance(result.index.freq, offsets.BDay) + tm.assertIsInstance(result.index.freq, BDay) self.assertEqual(len(result), 0) def test_arithmetic_interaction(self): @@ -140,7 +137,7 @@ def test_unpickle_daterange(self): rng = read_pickle(filepath) tm.assertIsInstance(rng[0], datetime) - tm.assertIsInstance(rng.offset, offsets.BDay) + tm.assertIsInstance(rng.offset, BDay) self.assertEqual(rng.values.dtype, object) def test_setops(self): @@ -213,20 +210,15 @@ def test_legacy_time_rules(self): new_rng = date_range(start, end, freq=new_freq) self.assert_index_equal(old_rng, new_rng) - # test get_legacy_offset_name - offset = datetools.get_offset(new_freq) - old_name = datetools.get_legacy_offset_name(offset) - self.assertEqual(old_name, old_freq) - def test_ms_vs_MS(self): - left = datetools.get_offset('ms') - right = datetools.get_offset('MS') - self.assertEqual(left, datetools.Milli()) - self.assertEqual(right, datetools.MonthBegin()) + left = get_offset('ms') + right = get_offset('MS') + self.assertEqual(left, Milli()) + self.assertEqual(right, MonthBegin()) def test_rule_aliases(self): - rule = datetools.to_offset('10us') - self.assertEqual(rule, datetools.Micro(10)) + rule = to_offset('10us') + self.assertEqual(rule, Micro(10)) if __name__ == '__main__': diff --git a/pandas/tseries/tests/test_timezones.py b/pandas/tseries/tests/test_timezones.py index afe9d0652db19..b8247fe01b3f2 100644 --- a/pandas/tseries/tests/test_timezones.py +++ b/pandas/tseries/tests/test_timezones.py @@ -5,20 +5,19 @@ import numpy as np import pytz +from pandas.types.dtypes import DatetimeTZDtype from pandas import (Index, Series, DataFrame, isnull, Timestamp) from pandas import DatetimeIndex, to_datetime, NaT from pandas import tslib -import pandas.core.datetools as datetools import pandas.tseries.offsets as offsets from pandas.tseries.index import bdate_range, date_range import pandas.tseries.tools as tools from pytz import NonExistentTimeError import pandas.util.testing as tm -from pandas.types.api import DatetimeTZDtype -from pandas.util.testing import assert_frame_equal +from pandas.util.testing import assert_frame_equal, set_timezone from pandas.compat import lrange, zip try: @@ -169,7 +168,7 @@ def test_timestamp_to_datetime_tzoffset(self): from dateutil.tz import tzoffset tzinfo = tzoffset(None, 7200) expected = Timestamp('3/11/2012 04:00', tz=tzinfo) - result = Timestamp(expected.to_datetime()) + result = Timestamp(expected.to_pydatetime()) self.assertEqual(expected, result) def test_timedelta_push_over_dst_boundary(self): @@ -371,7 +370,7 @@ def test_with_tz(self): # just want it to work start = datetime(2011, 3, 12, tzinfo=pytz.utc) - dr = bdate_range(start, periods=50, freq=datetools.Hour()) + dr = bdate_range(start, periods=50, freq=offsets.Hour()) self.assertIs(dr.tz, pytz.utc) # DateRange with naive datetimes @@ -409,33 +408,33 @@ def test_with_tz_ambiguous_times(self): # March 13, 2011, spring forward, skip from 2 AM to 3 AM dr = date_range(datetime(2011, 3, 13, 1, 30), periods=3, - freq=datetools.Hour()) + freq=offsets.Hour()) self.assertRaises(pytz.NonExistentTimeError, dr.tz_localize, tz) # after dst transition, it works dr = date_range(datetime(2011, 3, 13, 3, 30), periods=3, - freq=datetools.Hour(), tz=tz) + freq=offsets.Hour(), tz=tz) # November 6, 2011, fall back, 
repeat 2 AM hour dr = date_range(datetime(2011, 11, 6, 1, 30), periods=3, - freq=datetools.Hour()) + freq=offsets.Hour()) self.assertRaises(pytz.AmbiguousTimeError, dr.tz_localize, tz) # UTC is OK dr = date_range(datetime(2011, 3, 13), periods=48, - freq=datetools.Minute(30), tz=pytz.utc) + freq=offsets.Minute(30), tz=pytz.utc) def test_ambiguous_infer(self): # November 6, 2011, fall back, repeat 2 AM hour # With no repeated hours, we cannot infer the transition tz = self.tz('US/Eastern') dr = date_range(datetime(2011, 11, 6, 0), periods=5, - freq=datetools.Hour()) + freq=offsets.Hour()) self.assertRaises(pytz.AmbiguousTimeError, dr.tz_localize, tz) # With repeated hours, we can infer the transition dr = date_range(datetime(2011, 11, 6, 0), periods=5, - freq=datetools.Hour(), tz=tz) + freq=offsets.Hour(), tz=tz) times = ['11/06/2011 00:00', '11/06/2011 01:00', '11/06/2011 01:00', '11/06/2011 02:00', '11/06/2011 03:00'] di = DatetimeIndex(times) @@ -449,7 +448,7 @@ def test_ambiguous_infer(self): # When there is no dst transition, nothing special happens dr = date_range(datetime(2011, 6, 1, 0), periods=10, - freq=datetools.Hour()) + freq=offsets.Hour()) localized = dr.tz_localize(tz) localized_infer = dr.tz_localize(tz, ambiguous='infer') self.assert_index_equal(localized, localized_infer) @@ -463,7 +462,7 @@ def test_ambiguous_flags(self): # Pass in flags to determine right dst transition dr = date_range(datetime(2011, 11, 6, 0), periods=5, - freq=datetools.Hour(), tz=tz) + freq=offsets.Hour(), tz=tz) times = ['11/06/2011 00:00', '11/06/2011 01:00', '11/06/2011 01:00', '11/06/2011 02:00', '11/06/2011 03:00'] @@ -501,7 +500,7 @@ def test_ambiguous_flags(self): # When there is no dst transition, nothing special happens dr = date_range(datetime(2011, 6, 1, 0), periods=10, - freq=datetools.Hour()) + freq=offsets.Hour()) is_dst = np.array([1] * 10) localized = dr.tz_localize(tz) localized_is_dst = dr.tz_localize(tz, ambiguous=is_dst) @@ -1061,6 +1060,46 @@ def test_tslib_tz_convert_dst(self): self.assert_numpy_array_equal(idx.hour, np.array([4, 4], dtype=np.int32)) + def test_tzlocal(self): + # GH 13583 + ts = Timestamp('2011-01-01', tz=dateutil.tz.tzlocal()) + self.assertEqual(ts.tz, dateutil.tz.tzlocal()) + self.assertTrue("tz='tzlocal()')" in repr(ts)) + + tz = tslib.maybe_get_tz('tzlocal()') + self.assertEqual(tz, dateutil.tz.tzlocal()) + + # get offset using normal datetime for test + offset = dateutil.tz.tzlocal().utcoffset(datetime(2011, 1, 1)) + offset = offset.total_seconds() * 1000000000 + self.assertEqual(ts.value + offset, Timestamp('2011-01-01').value) + + def test_tz_localize_tzlocal(self): + # GH 13583 + offset = dateutil.tz.tzlocal().utcoffset(datetime(2011, 1, 1)) + offset = int(offset.total_seconds() * 1000000000) + + dti = date_range(start='2001-01-01', end='2001-03-01') + dti2 = dti.tz_localize(dateutil.tz.tzlocal()) + tm.assert_numpy_array_equal(dti2.asi8 + offset, dti.asi8) + + dti = date_range(start='2001-01-01', end='2001-03-01', + tz=dateutil.tz.tzlocal()) + dti2 = dti.tz_localize(None) + tm.assert_numpy_array_equal(dti2.asi8 - offset, dti.asi8) + + def test_tz_convert_tzlocal(self): + # GH 13583 + # tz_convert doesn't affect to internal + dti = date_range(start='2001-01-01', end='2001-03-01', tz='UTC') + dti2 = dti.tz_convert(dateutil.tz.tzlocal()) + tm.assert_numpy_array_equal(dti2.asi8, dti.asi8) + + dti = date_range(start='2001-01-01', end='2001-03-01', + tz=dateutil.tz.tzlocal()) + dti2 = dti.tz_convert(None) + tm.assert_numpy_array_equal(dti2.asi8, dti.asi8) + 
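Note (editorial aside, not part of the patch): the new test_tzlocal, test_tz_localize_tzlocal and test_tz_convert_tzlocal cases above (GH 13583) exercise support for dateutil's tzlocal timezone. An illustrative sketch of the behaviour they cover:

import dateutil.tz
import pandas as pd

# a tz-aware Timestamp in the machine's local zone, as test_tzlocal constructs
ts = pd.Timestamp('2011-01-01', tz=dateutil.tz.tzlocal())

dti = pd.date_range('2001-01-01', '2001-03-01')
# localizing shifts the stored i8 values by the local UTC offset (test_tz_localize_tzlocal)
localized = dti.tz_localize(dateutil.tz.tzlocal())
# converting changes only the representation; the underlying asi8 values are unchanged
# (test_tz_convert_tzlocal)
converted = localized.tz_convert(None)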
class TestTimeZoneCacheKey(tm.TestCase): def test_cache_keys_are_distinct_for_pytz_vs_dateutil(self): @@ -1256,27 +1295,59 @@ def test_append_aware(self): tz='US/Eastern') rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', tz='US/Eastern') - ts1 = Series(np.random.randn(len(rng1)), index=rng1) - ts2 = Series(np.random.randn(len(rng2)), index=rng2) + ts1 = Series([1], index=rng1) + ts2 = Series([2], index=rng2) ts_result = ts1.append(ts2) + + exp_index = DatetimeIndex(['2011-01-01 01:00', '2011-01-01 02:00'], + tz='US/Eastern') + exp = Series([1, 2], index=exp_index) + self.assert_series_equal(ts_result, exp) self.assertEqual(ts_result.index.tz, rng1.tz) rng1 = date_range('1/1/2011 01:00', periods=1, freq='H', tz='UTC') rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', tz='UTC') - ts1 = Series(np.random.randn(len(rng1)), index=rng1) - ts2 = Series(np.random.randn(len(rng2)), index=rng2) + ts1 = Series([1], index=rng1) + ts2 = Series([2], index=rng2) ts_result = ts1.append(ts2) + + exp_index = DatetimeIndex(['2011-01-01 01:00', '2011-01-01 02:00'], + tz='UTC') + exp = Series([1, 2], index=exp_index) + self.assert_series_equal(ts_result, exp) utc = rng1.tz self.assertEqual(utc, ts_result.index.tz) + # GH 7795 + # different tz coerces to object dtype, not UTC rng1 = date_range('1/1/2011 01:00', periods=1, freq='H', tz='US/Eastern') rng2 = date_range('1/1/2011 02:00', periods=1, freq='H', tz='US/Central') - ts1 = Series(np.random.randn(len(rng1)), index=rng1) - ts2 = Series(np.random.randn(len(rng2)), index=rng2) + ts1 = Series([1], index=rng1) + ts2 = Series([2], index=rng2) ts_result = ts1.append(ts2) - self.assertEqual(utc, ts_result.index.tz) + exp_index = Index([Timestamp('1/1/2011 01:00', tz='US/Eastern'), + Timestamp('1/1/2011 02:00', tz='US/Central')]) + exp = Series([1, 2], index=exp_index) + self.assert_series_equal(ts_result, exp) + + def test_append_dst(self): + rng1 = date_range('1/1/2016 01:00', periods=3, freq='H', + tz='US/Eastern') + rng2 = date_range('8/1/2016 01:00', periods=3, freq='H', + tz='US/Eastern') + ts1 = Series([1, 2, 3], index=rng1) + ts2 = Series([10, 11, 12], index=rng2) + ts_result = ts1.append(ts2) + + exp_index = DatetimeIndex(['2016-01-01 01:00', '2016-01-01 02:00', + '2016-01-01 03:00', '2016-08-01 01:00', + '2016-08-01 02:00', '2016-08-01 03:00'], + tz='US/Eastern') + exp = Series([1, 2, 3, 10, 11, 12], index=exp_index) + tm.assert_series_equal(ts_result, exp) + self.assertEqual(ts_result.index.tz, rng1.tz) def test_append_aware_naive(self): rng1 = date_range('1/1/2011 01:00', periods=1, freq='H') @@ -1398,6 +1469,26 @@ def test_normalize_tz(self): self.assertTrue(result.is_normalized) self.assertFalse(rng.is_normalized) + def test_normalize_tz_local(self): + # GH 13459 + from dateutil.tz import tzlocal + + timezones = ['US/Pacific', 'US/Eastern', 'UTC', 'Asia/Kolkata', + 'Asia/Shanghai', 'Australia/Canberra'] + + for timezone in timezones: + with set_timezone(timezone): + rng = date_range('1/1/2000 9:30', periods=10, freq='D', + tz=tzlocal()) + + result = rng.normalize() + expected = date_range('1/1/2000', periods=10, freq='D', + tz=tzlocal()) + self.assert_index_equal(result, expected) + + self.assertTrue(result.is_normalized) + self.assertFalse(rng.is_normalized) + def test_tzaware_offset(self): dates = date_range('2012-11-01', periods=3, tz='US/Pacific') offset = dates + offsets.Hour(5) diff --git a/pandas/tseries/tests/test_tslib.py b/pandas/tseries/tests/test_tslib.py index c6436163b9edb..6cee45df2a63c 100644 --- 
a/pandas/tseries/tests/test_tslib.py +++ b/pandas/tseries/tests/test_tslib.py @@ -7,7 +7,8 @@ import datetime import pandas as pd -from pandas.core.api import Timestamp, Series, Timedelta, Period, to_datetime +from pandas.core.api import (Timestamp, Index, Series, Timedelta, Period, + to_datetime) from pandas.tslib import get_timezone from pandas._period import period_asfreq, period_ordinal from pandas.tseries.index import date_range, DatetimeIndex @@ -46,12 +47,17 @@ def test_max_valid(self): def test_to_datetime_bijective(self): # Ensure that converting to datetime and back only loses precision # by going from nanoseconds to microseconds. - self.assertEqual( - Timestamp(Timestamp.max.to_pydatetime()).value / 1000, - Timestamp.max.value / 1000) - self.assertEqual( - Timestamp(Timestamp.min.to_pydatetime()).value / 1000, - Timestamp.min.value / 1000) + exp_warning = None if Timestamp.max.nanosecond == 0 else UserWarning + with tm.assert_produces_warning(exp_warning, check_stacklevel=False): + self.assertEqual( + Timestamp(Timestamp.max.to_pydatetime()).value / 1000, + Timestamp.max.value / 1000) + + exp_warning = None if Timestamp.min.nanosecond == 0 else UserWarning + with tm.assert_produces_warning(exp_warning, check_stacklevel=False): + self.assertEqual( + Timestamp(Timestamp.min.to_pydatetime()).value / 1000, + Timestamp.min.value / 1000) class TestTimestamp(tm.TestCase): @@ -255,6 +261,48 @@ def test_constructor_keyword(self): hour=1, minute=2, second=3, microsecond=999999)), repr(Timestamp('2015-11-12 01:02:03.999999'))) + def test_constructor_fromordinal(self): + base = datetime.datetime(2000, 1, 1) + + ts = Timestamp.fromordinal(base.toordinal(), freq='D') + self.assertEqual(base, ts) + self.assertEqual(ts.freq, 'D') + self.assertEqual(base.toordinal(), ts.toordinal()) + + ts = Timestamp.fromordinal(base.toordinal(), tz='US/Eastern') + self.assertEqual(pd.Timestamp('2000-01-01', tz='US/Eastern'), ts) + self.assertEqual(base.toordinal(), ts.toordinal()) + + def test_constructor_offset_depr(self): + # GH 12160 + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + ts = Timestamp('2011-01-01', offset='D') + self.assertEqual(ts.freq, 'D') + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + self.assertEqual(ts.offset, 'D') + + msg = "Can only specify freq or offset, not both" + with tm.assertRaisesRegexp(TypeError, msg): + Timestamp('2011-01-01', offset='D', freq='D') + + def test_constructor_offset_depr_fromordinal(self): + # GH 12160 + base = datetime.datetime(2000, 1, 1) + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + ts = Timestamp.fromordinal(base.toordinal(), offset='D') + self.assertEqual(pd.Timestamp('2000-01-01'), ts) + self.assertEqual(ts.freq, 'D') + self.assertEqual(base.toordinal(), ts.toordinal()) + + msg = "Can only specify freq or offset, not both" + with tm.assertRaisesRegexp(TypeError, msg): + Timestamp.fromordinal(base.toordinal(), offset='D', freq='D') + def test_conversion(self): # GH 9255 ts = Timestamp('2000-01-01') @@ -312,13 +360,13 @@ def test_repr(self): self.assertNotIn(freq_repr, repr(date_tz)) self.assertEqual(date_tz, eval(repr(date_tz))) - date_freq = Timestamp(date, offset=freq) + date_freq = Timestamp(date, freq=freq) self.assertIn(date, repr(date_freq)) self.assertNotIn(tz_repr, repr(date_freq)) self.assertIn(freq_repr, repr(date_freq)) self.assertEqual(date_freq, eval(repr(date_freq))) - date_tz_freq = Timestamp(date, tz=tz, offset=freq) + date_tz_freq = Timestamp(date, 
tz=tz, freq=freq) self.assertIn(date, repr(date_tz_freq)) self.assertIn(tz_repr, repr(date_tz_freq)) self.assertIn(freq_repr, repr(date_tz_freq)) @@ -573,6 +621,26 @@ def test_pprint(self): 'foo': 1}""" self.assertEqual(result, expected) + def to_datetime_depr(self): + # see gh-8254 + ts = Timestamp('2011-01-01') + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + expected = datetime.datetime(2011, 1, 1) + result = ts.to_datetime() + self.assertEqual(result, expected) + + def to_pydatetime_nonzero_nano(self): + ts = Timestamp('2011-01-01 9:00:00.123456789') + + # Warn the user of data loss (nanoseconds). + with tm.assert_produces_warning(UserWarning, + check_stacklevel=False): + expected = datetime.datetime(2011, 1, 1, 9, 0, 0, 123456) + result = ts.to_pydatetime() + self.assertEqual(result, expected) + class TestDatetimeParsingWrappers(tm.TestCase): def test_does_not_convert_mixed_integer(self): @@ -656,14 +724,19 @@ def test_parsers(self): yearfirst=yearfirst) result2 = to_datetime(date_str, yearfirst=yearfirst) result3 = to_datetime([date_str], yearfirst=yearfirst) + # result5 is used below result4 = to_datetime(np.array([date_str], dtype=object), yearfirst=yearfirst) - result6 = DatetimeIndex([date_str], yearfirst=yearfirst)[0] - self.assertEqual(result1, expected) - self.assertEqual(result2, expected) - self.assertEqual(result3, expected) - self.assertEqual(result4, expected) - self.assertEqual(result6, expected) + result6 = DatetimeIndex([date_str], yearfirst=yearfirst) + # result7 is used below + result8 = DatetimeIndex(Index([date_str]), yearfirst=yearfirst) + result9 = DatetimeIndex(Series([date_str]), yearfirst=yearfirst) + + for res in [result1, result2]: + self.assertEqual(res, expected) + for res in [result3, result4, result6, result8, result9]: + exp = DatetimeIndex([pd.Timestamp(expected)]) + tm.assert_index_equal(res, exp) # these really need to have yearfist, but we don't support if not yearfirst: @@ -851,9 +924,7 @@ def test_parsers_monthfreq(self): for date_str, expected in compat.iteritems(cases): result1, _, _ = tools.parse_time_string(date_str, freq='M') - result2 = tools._to_datetime(date_str, freq='M') self.assertEqual(result1, expected) - self.assertEqual(result2, expected) def test_parsers_quarterly_with_freq(self): msg = ('Incorrect quarterly string is given, quarter ' @@ -1182,6 +1253,13 @@ def test_nat_arithmetic(self): self.assertIs(left - right, pd.NaT) self.assertIs(right - left, pd.NaT) + # int addition / subtraction + for (left, right) in [(pd.NaT, 2), (pd.NaT, 0), (pd.NaT, -3)]: + self.assertIs(right + left, pd.NaT) + self.assertIs(left + right, pd.NaT) + self.assertIs(left - right, pd.NaT) + self.assertIs(right - left, pd.NaT) + def test_nat_arithmetic_index(self): # GH 11718 @@ -1373,16 +1451,14 @@ def _check_round(freq, expected): result = stamp.round(freq=freq) self.assertEqual(result, expected) - for freq, expected in [ - ('D', Timestamp('2000-01-05 00:00:00')), - ('H', Timestamp('2000-01-05 05:00:00')), - ('S', Timestamp('2000-01-05 05:09:15')) - ]: + for freq, expected in [('D', Timestamp('2000-01-05 00:00:00')), + ('H', Timestamp('2000-01-05 05:00:00')), + ('S', Timestamp('2000-01-05 05:09:15'))]: _check_round(freq, expected) - msg = "Could not evaluate" - tm.assertRaisesRegexp(ValueError, msg, - stamp.round, 'foo') + msg = pd.tseries.frequencies._INVALID_FREQ_ERROR + with self.assertRaisesRegexp(ValueError, msg): + stamp.round('foo') class TestTimestampOps(tm.TestCase): diff --git a/pandas/tseries/tests/test_util.py 
b/pandas/tseries/tests/test_util.py index 9c5c9b7a03445..96da32a4a845c 100644 --- a/pandas/tseries/tests/test_util.py +++ b/pandas/tseries/tests/test_util.py @@ -21,10 +21,13 @@ def test_daily(self): rng = date_range('1/1/2000', '12/31/2004', freq='D') ts = Series(np.random.randn(len(rng)), index=rng) - annual = pivot_annual(ts, 'D') + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + annual = pivot_annual(ts, 'D') doy = ts.index.dayofyear - doy[(~isleapyear(ts.index.year)) & (doy >= 60)] += 1 + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + doy[(~isleapyear(ts.index.year)) & (doy >= 60)] += 1 for i in range(1, 367): subset = ts[doy == i] @@ -50,10 +53,13 @@ def test_hourly(self): grouped = ts_hourly.groupby(ts_hourly.index.year) hoy = grouped.apply(lambda x: x.reset_index(drop=True)) hoy = hoy.index.droplevel(0).values - hoy[~isleapyear(ts_hourly.index.year) & (hoy >= 1416)] += 24 + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + hoy[~isleapyear(ts_hourly.index.year) & (hoy >= 1416)] += 24 hoy += 1 - annual = pivot_annual(ts_hourly) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + annual = pivot_annual(ts_hourly) ts_hourly = ts_hourly.astype(float) for i in [1, 1416, 1417, 1418, 1439, 1440, 1441, 8784]: @@ -78,7 +84,8 @@ def test_monthly(self): rng = date_range('1/1/2000', '12/31/2004', freq='M') ts = Series(np.random.randn(len(rng)), index=rng) - annual = pivot_annual(ts, 'M') + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + annual = pivot_annual(ts, 'M') month = ts.index.month for i in range(1, 13): @@ -97,6 +104,16 @@ def test_period_daily(self): def test_period_weekly(self): pass + def test_isleapyear_deprecate(self): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + self.assertTrue(isleapyear(2000)) + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + self.assertFalse(isleapyear(2001)) + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + self.assertTrue(isleapyear(2004)) + def test_normalize_date(): value = date(2012, 9, 7) diff --git a/pandas/tseries/timedeltas.py b/pandas/tseries/timedeltas.py index 7ff5d7adcaa35..2ca3fcea8005b 100644 --- a/pandas/tseries/timedeltas.py +++ b/pandas/tseries/timedeltas.py @@ -3,10 +3,14 @@ """ import numpy as np +import pandas as pd import pandas.tslib as tslib -from pandas.core.common import (ABCSeries, is_integer_dtype, - is_timedelta64_dtype, is_list_like, - _ensure_object, ABCIndexClass) + +from pandas.types.common import (_ensure_object, + is_integer_dtype, + is_timedelta64_dtype, + is_list_like) +from pandas.types.generic import ABCSeries, ABCIndexClass from pandas.util.decorators import deprecate_kwarg @@ -62,37 +66,22 @@ def to_timedelta(arg, unit='ns', box=True, errors='raise', coerce=None): """ unit = _validate_timedelta_unit(unit) - def _convert_listlike(arg, box, unit, name=None): - - if isinstance(arg, (list, tuple)) or not hasattr(arg, 'dtype'): - arg = np.array(list(arg), dtype='O') - - # these are shortcutable - if is_timedelta64_dtype(arg): - value = arg.astype('timedelta64[ns]') - elif is_integer_dtype(arg): - value = arg.astype('timedelta64[{0}]'.format( - unit)).astype('timedelta64[ns]', copy=False) - else: - value = tslib.array_to_timedelta64( - _ensure_object(arg), unit=unit, errors=errors) - value = value.astype('timedelta64[ns]', copy=False) - - if box: - from pandas import TimedeltaIndex - value = TimedeltaIndex(value, 
unit='ns', name=name) - return value + if errors not in ('ignore', 'raise', 'coerce'): + raise ValueError("errors must be one of 'ignore', " + "'raise', or 'coerce'}") if arg is None: return arg elif isinstance(arg, ABCSeries): from pandas import Series - values = _convert_listlike(arg._values, box=False, unit=unit) - return Series(values, index=arg.index, name=arg.name, dtype='m8[ns]') + values = _convert_listlike(arg._values, unit=unit, + box=False, errors=errors) + return Series(values, index=arg.index, name=arg.name) elif isinstance(arg, ABCIndexClass): - return _convert_listlike(arg, box=box, unit=unit, name=arg.name) + return _convert_listlike(arg, unit=unit, box=box, + errors=errors, name=arg.name) elif is_list_like(arg) and getattr(arg, 'ndim', 1) == 1: - return _convert_listlike(arg, box=box, unit=unit) + return _convert_listlike(arg, unit=unit, box=box, errors=errors) elif getattr(arg, 'ndim', 1) > 1: raise TypeError('arg must be a string, timedelta, list, tuple, ' '1-d array, or Series') @@ -140,13 +129,55 @@ def _validate_timedelta_unit(arg): def _coerce_scalar_to_timedelta_type(r, unit='ns', box=True, errors='raise'): - """ - convert strings to timedelta; coerce to Timedelta (if box), else - np.timedelta64 - """ + """Convert string 'r' to a timedelta object.""" + + try: + result = tslib.convert_to_timedelta64(r, unit) + except ValueError: + if errors == 'raise': + raise + elif errors == 'ignore': + return r + + # coerce + result = pd.NaT - result = tslib.convert_to_timedelta(r, unit, errors) if box: result = tslib.Timedelta(result) - return result + + +def _convert_listlike(arg, unit='ns', box=True, errors='raise', name=None): + """Convert a list of objects to a timedelta index object.""" + + if isinstance(arg, (list, tuple)) or not hasattr(arg, 'dtype'): + arg = np.array(list(arg), dtype='O') + + # these are shortcut-able + if is_timedelta64_dtype(arg): + value = arg.astype('timedelta64[ns]') + elif is_integer_dtype(arg): + value = arg.astype('timedelta64[{0}]'.format( + unit)).astype('timedelta64[ns]', copy=False) + else: + try: + value = tslib.array_to_timedelta64(_ensure_object(arg), + unit=unit, errors=errors) + value = value.astype('timedelta64[ns]', copy=False) + except ValueError: + if errors == 'ignore': + return arg + else: + # This else-block accounts for the cases when errors='raise' + # and errors='coerce'. If errors == 'raise', these errors + # should be raised. If errors == 'coerce', we shouldn't + # expect any errors to be raised, since all parsing errors + # cause coercion to pd.NaT. However, if an error / bug is + # introduced that causes an Exception to be raised, we would + # like to surface it. 
+ raise + + if box: + from pandas import TimedeltaIndex + value = TimedeltaIndex(value, unit='ns', name=name) + return value diff --git a/pandas/tseries/tools.py b/pandas/tseries/tools.py index d5e87d1df2462..93d35ff964e69 100644 --- a/pandas/tseries/tools.py +++ b/pandas/tseries/tools.py @@ -4,8 +4,17 @@ import pandas.lib as lib import pandas.tslib as tslib -import pandas.core.common as com -from pandas.core.common import ABCIndexClass, ABCSeries, ABCDataFrame + +from pandas.types.common import (_ensure_object, + is_datetime64_ns_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_integer_dtype, + is_list_like) +from pandas.types.generic import (ABCIndexClass, ABCSeries, + ABCDataFrame) +from pandas.types.missing import notnull + import pandas.compat as compat from pandas.util.decorators import deprecate_kwarg @@ -161,7 +170,7 @@ def _guess_datetime_format(dt_str, dayfirst=False, def _guess_datetime_format_for_array(arr, **kwargs): # Try to guess the format based on the first non-NaN element - non_nan_elements = com.notnull(arr).nonzero()[0] + non_nan_elements = notnull(arr).nonzero()[0] if len(non_nan_elements): return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs) @@ -286,40 +295,29 @@ def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, 1 loop, best of 3: 471 ms per loop """ - return _to_datetime(arg, errors=errors, dayfirst=dayfirst, - yearfirst=yearfirst, - utc=utc, box=box, format=format, exact=exact, - unit=unit, infer_datetime_format=infer_datetime_format) - -def _to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, - utc=None, box=True, format=None, exact=True, - unit=None, freq=None, infer_datetime_format=False): - """ - Same as to_datetime, but accept freq for - DatetimeIndex internal construction - """ from pandas.tseries.index import DatetimeIndex - def _convert_listlike(arg, box, format, name=None): + tz = 'utc' if utc else None + + def _convert_listlike(arg, box, format, name=None, tz=tz): if isinstance(arg, (list, tuple)): arg = np.array(arg, dtype='O') # these are shortcutable - if com.is_datetime64_ns_dtype(arg): + if is_datetime64_ns_dtype(arg): if box and not isinstance(arg, DatetimeIndex): try: - return DatetimeIndex(arg, tz='utc' if utc else None, - name=name) + return DatetimeIndex(arg, tz=tz, name=name) except ValueError: pass return arg - elif com.is_datetime64tz_dtype(arg): + elif is_datetime64tz_dtype(arg): if not isinstance(arg, DatetimeIndex): - return DatetimeIndex(arg, tz='utc' if utc else None) + return DatetimeIndex(arg, tz=tz, name=name) if utc: arg = arg.tz_convert(None).tz_localize('UTC') return arg @@ -335,14 +333,13 @@ def _convert_listlike(arg, box, format, name=None): from pandas import Index return Index(result) - return DatetimeIndex(result, tz='utc' if utc else None, - name=name) + return DatetimeIndex(result, tz=tz, name=name) return result elif getattr(arg, 'ndim', 1) > 1: raise TypeError('arg must be a string, datetime, list, tuple, ' '1-d array, or Series') - arg = com._ensure_object(arg) + arg = _ensure_object(arg) require_iso8601 = False if infer_datetime_format and format is None: @@ -373,8 +370,8 @@ def _convert_listlike(arg, box, format, name=None): # fallback if result is None: try: - result = tslib.array_strptime( - arg, format, exact=exact, errors=errors) + result = tslib.array_strptime(arg, format, exact=exact, + errors=errors) except tslib.OutOfBoundsDatetime: if errors == 'raise': raise @@ -395,14 +392,11 @@ def _convert_listlike(arg, box, format, name=None): utc=utc, 
dayfirst=dayfirst, yearfirst=yearfirst, - freq=freq, require_iso8601=require_iso8601 ) - if com.is_datetime64_dtype(result) and box: - result = DatetimeIndex(result, - tz='utc' if utc else None, - name=name) + if is_datetime64_dtype(result) and box: + result = DatetimeIndex(result, tz=tz, name=name) return result except ValueError as e: @@ -424,7 +418,7 @@ def _convert_listlike(arg, box, format, name=None): return _assemble_from_unit_mappings(arg, errors=errors) elif isinstance(arg, ABCIndexClass): return _convert_listlike(arg, box, format, name=arg.name) - elif com.is_list_like(arg): + elif is_list_like(arg): return _convert_listlike(arg, box, format) return _convert_listlike(np.array([arg]), box, format)[0] @@ -508,7 +502,12 @@ def f(value): def coerce(values): # we allow coercion to if errors allows - return to_numeric(values, errors=errors) + values = to_numeric(values, errors=errors) + + # prevent overflow in case of int8 or int16 + if is_integer_dtype(values): + values = values.astype('int64', copy=False) + return values values = (coerce(arg[unit_rev['year']]) * 10000 + coerce(arg[unit_rev['month']]) * 100 + @@ -569,7 +568,7 @@ def calc_with_mask(carg, mask): # a float with actual np.nan try: carg = arg.astype(np.float64) - return calc_with_mask(carg, com.notnull(carg)) + return calc_with_mask(carg, notnull(carg)) except: pass @@ -649,7 +648,7 @@ def parse_time_string(arg, freq=None, dayfirst=None, yearfirst=None): def _guess_time_format_for_array(arr): # Try to guess the format based on the first non-NaN element - non_nan_elements = com.notnull(arr).nonzero()[0] + non_nan_elements = notnull(arr).nonzero()[0] if len(non_nan_elements): element = arr[non_nan_elements[0]] for time_format in _time_formats: @@ -700,7 +699,7 @@ def _convert_listlike(arg, format): raise TypeError('arg must be a string, datetime, list, tuple, ' '1-d array, or Series') - arg = com._ensure_object(arg) + arg = _ensure_object(arg) if infer_time_format and format is None: format = _guess_time_format_for_array(arg) @@ -757,7 +756,7 @@ def _convert_listlike(arg, format): return Series(values, index=arg.index, name=arg.name) elif isinstance(arg, ABCIndexClass): return _convert_listlike(arg, format) - elif com.is_list_like(arg): + elif is_list_like(arg): return _convert_listlike(arg, format) return _convert_listlike(np.array([arg]), format)[0] diff --git a/pandas/tseries/util.py b/pandas/tseries/util.py index 7e314657cb25c..59daa8d7780b4 100644 --- a/pandas/tseries/util.py +++ b/pandas/tseries/util.py @@ -1,12 +1,16 @@ +import warnings + from pandas.compat import lrange import numpy as np -import pandas.core.common as com +from pandas.types.common import _ensure_platform_int from pandas.core.frame import DataFrame import pandas.core.nanops as nanops def pivot_annual(series, freq=None): """ + Deprecated. Use ``pivot_table`` instead. + Group a series by years, taking leap years into account. The output has as many rows as distinct years in the original series, @@ -35,6 +39,10 @@ def pivot_annual(series, freq=None): ------- annual : DataFrame """ + + msg = "pivot_annual is deprecated. 
Use pivot_table instead" + warnings.warn(msg, FutureWarning) + index = series.index year = index.year years = nanops.unique1d(year) @@ -69,7 +77,7 @@ def pivot_annual(series, freq=None): raise NotImplementedError(freq) flat_index = (year - years.min()) * width + offset - flat_index = com._ensure_platform_int(flat_index) + flat_index = _ensure_platform_int(flat_index) values = np.empty((len(years), width)) values.fill(np.nan) @@ -87,6 +95,10 @@ def isleapyear(year): year : integer / sequence A given (list of) year(s). """ + + msg = "isleapyear is deprecated. Use .is_leap_year property instead" + warnings.warn(msg, FutureWarning) + year = np.asarray(year) return np.logical_or(year % 400 == 0, np.logical_and(year % 4 == 0, year % 100 > 0)) diff --git a/pandas/tslib.pxd b/pandas/tslib.pxd index d6c5810e1d713..aa8cbcb2cedc7 100644 --- a/pandas/tslib.pxd +++ b/pandas/tslib.pxd @@ -1,7 +1,7 @@ from numpy cimport ndarray, int64_t cdef convert_to_tsobject(object, object, object, bint, bint) -cdef convert_to_timedelta64(object, object, object) +cpdef convert_to_timedelta64(object, object) cpdef object maybe_get_tz(object) cdef bint _is_utc(object) cdef bint _is_tzlocal(object) diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 6453e65ecdc81..9073ad0abd535 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -1,5 +1,7 @@ # cython: profile=False +import warnings + cimport numpy as np from numpy cimport (int8_t, int32_t, int64_t, import_array, ndarray, NPY_INT64, NPY_DATETIME, NPY_TIMEDELTA) @@ -34,7 +36,8 @@ cdef extern from "datetime_helper.h": from datetime cimport cmp_pandas_datetimestruct from libc.stdlib cimport free -from util cimport is_integer_object, is_float_object, is_datetime64_object, is_timedelta64_object +from util cimport (is_integer_object, is_float_object, is_datetime64_object, + is_timedelta64_object) cimport util from datetime cimport * @@ -47,8 +50,10 @@ from datetime import time as datetime_time import re # dateutil compat -from dateutil.tz import (tzoffset, tzlocal as _dateutil_tzlocal, tzfile as _dateutil_tzfile, - tzutc as _dateutil_tzutc, tzstr as _dateutil_tzstr) +from dateutil.tz import (tzoffset, tzlocal as _dateutil_tzlocal, + tzfile as _dateutil_tzfile, + tzutc as _dateutil_tzutc, + tzstr as _dateutil_tzstr) from pandas.compat import is_platform_windows if is_platform_windows(): @@ -59,10 +64,12 @@ from dateutil.relativedelta import relativedelta from dateutil.parser import DEFAULTPARSER from pytz.tzinfo import BaseTzInfo as _pytz_BaseTzInfo -from pandas.compat import parse_date, string_types, iteritems, StringIO, callable +from pandas.compat import (parse_date, string_types, iteritems, + StringIO, callable) import operator import collections +import warnings # initialize numpy import_array() @@ -86,24 +93,31 @@ try: except NameError: # py3 basestring = str -cdef inline object create_timestamp_from_ts(int64_t value, pandas_datetimestruct dts, object tz, object offset): + +cdef inline object create_timestamp_from_ts( + int64_t value, pandas_datetimestruct dts, + object tz, object freq): cdef _Timestamp ts_base ts_base = _Timestamp.__new__(Timestamp, dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz) - ts_base.value = value - ts_base.offset = offset + ts_base.freq = freq ts_base.nanosecond = dts.ps / 1000 return ts_base -cdef inline object create_datetime_from_ts(int64_t value, pandas_datetimestruct dts, object tz, object offset): + +cdef inline object create_datetime_from_ts( + int64_t value, pandas_datetimestruct dts, + object tz, object freq): 
return datetime(dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz) -def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, offset=None, box=False): - # convert an i8 repr to an ndarray of datetimes or Timestamp (if box == True) + +def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, freq=None, box=False): + # convert an i8 repr to an ndarray of datetimes or Timestamp (if box == + # True) cdef: Py_ssize_t i, n = len(arr) @@ -113,9 +127,9 @@ def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, offset=None, box=False): ndarray[object] result = np.empty(n, dtype=object) object (*func_create)(int64_t, pandas_datetimestruct, object, object) - if box and util.is_string_object(offset): + if box and util.is_string_object(freq): from pandas.tseries.frequencies import to_offset - offset = to_offset(offset) + freq = to_offset(freq) if box: func_create = create_timestamp_from_ts @@ -129,16 +143,18 @@ def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, offset=None, box=False): if value == NPY_NAT: result[i] = NaT else: - pandas_datetime_to_datetimestruct(value, PANDAS_FR_ns, &dts) - result[i] = func_create(value, dts, tz, offset) + pandas_datetime_to_datetimestruct( + value, PANDAS_FR_ns, &dts) + result[i] = func_create(value, dts, tz, freq) elif _is_tzlocal(tz) or _is_fixed_offset(tz): for i in range(n): value = arr[i] if value == NPY_NAT: result[i] = NaT else: - pandas_datetime_to_datetimestruct(value, PANDAS_FR_ns, &dts) - dt = create_datetime_from_ts(value, dts, tz, offset) + pandas_datetime_to_datetimestruct( + value, PANDAS_FR_ns, &dts) + dt = create_datetime_from_ts(value, dts, tz, freq) dt = dt + tz.utcoffset(dt) if box: dt = Timestamp(dt) @@ -159,11 +175,13 @@ def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, offset=None, box=False): # find right representation of dst etc in pytz timezone new_tz = tz._tzinfos[tz._transition_info[pos]] else: - # no zone-name change for dateutil tzs - dst etc represented in single object. + # no zone-name change for dateutil tzs - dst etc + # represented in single object. 
new_tz = tz - pandas_datetime_to_datetimestruct(value + deltas[pos], PANDAS_FR_ns, &dts) - result[i] = func_create(value, dts, new_tz, offset) + pandas_datetime_to_datetimestruct( + value + deltas[pos], PANDAS_FR_ns, &dts) + result[i] = func_create(value, dts, new_tz, freq) else: for i in range(n): @@ -172,12 +190,14 @@ def ints_to_pydatetime(ndarray[int64_t] arr, tz=None, offset=None, box=False): result[i] = NaT else: pandas_datetime_to_datetimestruct(value, PANDAS_FR_ns, &dts) - result[i] = func_create(value, dts, None, offset) + result[i] = func_create(value, dts, None, freq) return result + def ints_to_pytimedelta(ndarray[int64_t] arr, box=False): - # convert an i8 repr to an ndarray of timedelta or Timedelta (if box == True) + # convert an i8 repr to an ndarray of timedelta or Timedelta (if box == + # True) cdef: Py_ssize_t i, n = len(arr) @@ -193,7 +213,7 @@ def ints_to_pytimedelta(ndarray[int64_t] arr, box=False): if box: result[i] = Timedelta(value) else: - result[i] = timedelta(microseconds=int(value)/1000) + result[i] = timedelta(microseconds=int(value) / 1000) return result @@ -201,6 +221,7 @@ def ints_to_pytimedelta(ndarray[int64_t] arr, box=False): cdef inline bint _is_tzlocal(object tz): return isinstance(tz, _dateutil_tzlocal) + cdef inline bint _is_fixed_offset(object tz): if _treat_tz_as_dateutil(tz): if len(tz._trans_idx) == 0 and len(tz._trans_list) == 0: @@ -208,7 +229,8 @@ cdef inline bint _is_fixed_offset(object tz): else: return 0 elif _treat_tz_as_pytz(tz): - if len(tz._transition_info) == 0 and len(tz._utc_transition_times) == 0: + if (len(tz._transition_info) == 0 + and len(tz._utc_transition_times) == 0): return 1 else: return 0 @@ -219,6 +241,8 @@ _no_input = object() # Python front end to C extension type _Timestamp # This serves as the box for datetime64 + + class Timestamp(_Timestamp): """TimeStamp is the pandas equivalent of python's Datetime and is interchangable with it in most cases. It's the type used @@ -233,12 +257,14 @@ class Timestamp(_Timestamp): ---------- ts_input : datetime-like, str, int, float Value to be converted to Timestamp - offset : str, DateOffset + freq : str, DateOffset Offset which Timestamp will have tz : string, pytz.timezone, dateutil.tz.tzfile or None Time zone for time which Timestamp will have. unit : string numpy unit used for conversion, if ts_input is int or float + offset : str, DateOffset + Deprecated, use freq The other two forms mimic the parameters from ``datetime.datetime``. They can be passed by either position or keyword, but not both mixed together. @@ -246,7 +272,7 @@ class Timestamp(_Timestamp): :func:`datetime.datetime` Parameters ------------------------------------ - .. versionadded:: 0.18.2 + .. versionadded:: 0.19.0 year : int month : int @@ -259,10 +285,24 @@ class Timestamp(_Timestamp): """ @classmethod - def fromordinal(cls, ordinal, offset=None, tz=None): - """ passed an ordinal, translate and convert to a ts - note: by definition there cannot be any tz info on the ordinal itself """ - return cls(datetime.fromordinal(ordinal),offset=offset,tz=tz) + def fromordinal(cls, ordinal, freq=None, tz=None, offset=None): + """ + passed an ordinal, translate and convert to a ts + note: by definition there cannot be any tz info on the ordinal itself + + Parameters + ---------- + ordinal : int + date corresponding to a proleptic Gregorian ordinal + freq : str, DateOffset + Offset which Timestamp will have + tz : string, pytz.timezone, dateutil.tz.tzfile or None + Time zone for time which Timestamp will have. 
+ offset : str, DateOffset + Deprecated, use freq + """ + return cls(datetime.fromordinal(ordinal), + freq=freq, tz=tz, offset=offset) @classmethod def now(cls, tz=None): @@ -309,11 +349,12 @@ class Timestamp(_Timestamp): def combine(cls, date, time): return cls(datetime.combine(date, time)) - def __new__(cls, - object ts_input=_no_input, object offset=None, tz=None, unit=None, - year=None, month=None, day=None, - hour=None, minute=None, second=None, microsecond=None, - tzinfo=None): + def __new__(cls, object ts_input=_no_input, + object freq=None, tz=None, unit=None, + year=None, month=None, day=None, + hour=None, minute=None, second=None, microsecond=None, + tzinfo=None, + object offset=None): # The parameter list folds together legacy parameter names (the first # four) and positional and keyword parameter names from pydatetime. # @@ -338,25 +379,37 @@ class Timestamp(_Timestamp): cdef _TSObject ts cdef _Timestamp ts_base + if offset is not None: + # deprecate offset kwd in 0.19.0, GH13593 + if freq is not None: + msg = "Can only specify freq or offset, not both" + raise TypeError(msg) + warnings.warn("offset is deprecated. Use freq instead", + FutureWarning) + freq = offset + if ts_input is _no_input: # User passed keyword arguments. return Timestamp(datetime(year, month, day, hour or 0, - minute or 0, second or 0, microsecond or 0, tzinfo), - tz=tzinfo) - elif is_integer_object(offset): + minute or 0, second or 0, + microsecond or 0, tzinfo), + tz=tzinfo) + elif is_integer_object(freq): # User passed positional arguments: - # Timestamp(year, month, day[, hour[, minute[, second[, microsecond[, tzinfo]]]]]) - return Timestamp(datetime(ts_input, offset, tz, unit or 0, - year or 0, month or 0, day or 0, hour), tz=hour) + # Timestamp(year, month, day[, hour[, minute[, second[, + # microsecond[, tzinfo]]]]]) + return Timestamp(datetime(ts_input, freq, tz, unit or 0, + year or 0, month or 0, day or 0, + hour), tz=hour) ts = convert_to_tsobject(ts_input, tz, unit, 0, 0) if ts.value == NPY_NAT: return NaT - if util.is_string_object(offset): + if util.is_string_object(freq): from pandas.tseries.frequencies import to_offset - offset = to_offset(offset) + freq = to_offset(freq) # make datetime happy ts_base = _Timestamp.__new__(cls, ts.dts.year, ts.dts.month, @@ -365,12 +418,11 @@ class Timestamp(_Timestamp): # fill out rest of data ts_base.value = ts.value - ts_base.offset = offset + ts_base.freq = freq ts_base.nanosecond = ts.dts.ps / 1000 return ts_base - def _round(self, freq, rounder): cdef int64_t unit @@ -382,7 +434,7 @@ class Timestamp(_Timestamp): value = self.tz_localize(None).value else: value = self.value - result = Timestamp(unit*rounder(value/float(unit)),unit='ns') + result = Timestamp(unit * rounder(value / float(unit)), unit='ns') if self.tz is not None: result = result.tz_localize(self.tz) return result @@ -433,16 +485,18 @@ class Timestamp(_Timestamp): return self.tzinfo @property - def freq(self): - return self.offset + def offset(self): + warnings.warn(".offset is deprecated. 
Use .freq instead", + FutureWarning) + return self.freq def __setstate__(self, state): self.value = state[0] - self.offset = state[1] + self.freq = state[1] self.tzinfo = state[2] def __reduce__(self): - object_state = self.value, self.offset, self.tzinfo + object_state = self.value, self.freq, self.tzinfo return (Timestamp, object_state) def to_period(self, freq=None): @@ -462,7 +516,8 @@ class Timestamp(_Timestamp): @property def weekday_name(self): - out = get_date_name_field(np.array([self.value], dtype=np.int64), 'weekday_name') + out = get_date_name_field( + np.array([self.value], dtype=np.int64), 'weekday_name') return out[0] @property @@ -491,7 +546,7 @@ class Timestamp(_Timestamp): @property def freqstr(self): - return getattr(self.offset, 'freqstr', self.offset) + return getattr(self.freq, 'freqstr', self.freq) @property def is_month_start(self): @@ -517,6 +572,10 @@ class Timestamp(_Timestamp): def is_year_end(self): return self._get_start_end_field('is_year_end') + @property + def is_leap_year(self): + return bool(is_leapyear(self.year)) + def tz_localize(self, tz, ambiguous='raise', errors='raise'): """ Convert naive Timestamp to local time zone, or remove @@ -539,7 +598,7 @@ class Timestamp(_Timestamp): - 'coerce' will return NaT if the timestamp can not be converted into the specified timezone - .. versionadded:: 0.18.2 + .. versionadded:: 0.19.0 Returns ------- @@ -557,8 +616,8 @@ class Timestamp(_Timestamp): # tz naive, localize tz = maybe_get_tz(tz) if not isinstance(ambiguous, basestring): - ambiguous = [ambiguous] - value = tz_localize_to_utc(np.array([self.value],dtype='i8'), tz, + ambiguous = [ambiguous] + value = tz_localize_to_utc(np.array([self.value], dtype='i8'), tz, ambiguous=ambiguous, errors=errors)[0] return Timestamp(value, tz=tz) else: @@ -570,7 +629,6 @@ class Timestamp(_Timestamp): raise TypeError('Cannot localize tz-aware Timestamp, use ' 'tz_convert for conversions') - def tz_convert(self, tz): """ Convert tz-aware Timestamp to another time zone. @@ -602,23 +660,7 @@ class Timestamp(_Timestamp): def replace(self, **kwds): return Timestamp(datetime.replace(self, **kwds), - offset=self.offset) - - def to_pydatetime(self, warn=True): - """ - If warn=True, issue warning if nanoseconds is nonzero - """ - cdef: - pandas_datetimestruct dts - _TSObject ts - - if self.nanosecond != 0 and warn: - print 'Warning: discarding nonzero nanoseconds' - ts = convert_to_tsobject(self, self.tzinfo, None, 0, 0) - - return datetime(ts.dts.year, ts.dts.month, ts.dts.day, - ts.dts.hour, ts.dts.min, ts.dts.sec, - ts.dts.us, ts.tzinfo) + freq=self.freq) def isoformat(self, sep='T'): base = super(_Timestamp, self).isoformat(sep=sep) @@ -658,25 +700,26 @@ class Timestamp(_Timestamp): year -= 1 month += 12 return (day + - np.fix((153*month - 457)/5) + - 365*year + + np.fix((153 * month - 457) / 5) + + 365 * year + np.floor(year / 4) - np.floor(year / 100) + np.floor(year / 400) + 1721118.5 + (self.hour + - self.minute/60.0 + - self.second/3600.0 + - self.microsecond/3600.0/1e+6 + - self.nanosecond/3600.0/1e+9 - )/24.0) + self.minute / 60.0 + + self.second / 3600.0 + + self.microsecond / 3600.0 / 1e+6 + + self.nanosecond / 3600.0 / 1e+9 + ) / 24.0) def normalize(self): """ Normalize Timestamp to midnight, preserving tz information. 
""" - normalized_value = date_normalize(np.array([self.value], dtype='i8'), tz=self.tz)[0] + normalized_value = date_normalize( + np.array([self.value], dtype='i8'), tz=self.tz)[0] return Timestamp(normalized_value).tz_localize(self.tz) def __radd__(self, other): @@ -685,7 +728,9 @@ class Timestamp(_Timestamp): return self + other -_nat_strings = set(['NaT','nat','NAT','nan','NaN','NAN']) +_nat_strings = set(['NaT', 'nat', 'NAT', 'nan', 'NaN', 'NAN']) + + class NaTType(_NaT): """(N)ot-(A)-(T)ime, the time equivalent of NaN""" @@ -724,6 +769,10 @@ class NaTType(_NaT): # GH 10939 return np.nan + @property + def is_leap_year(self): + return False + def __rdiv__(self, other): return _nat_rdivide_op(self, other) @@ -739,39 +788,42 @@ class NaTType(_NaT): return NotImplemented - fields = ['year', 'quarter', 'month', 'day', 'hour', 'minute', 'second', 'millisecond', 'microsecond', 'nanosecond', - 'week', 'dayofyear', 'days_in_month', 'daysinmonth', 'dayofweek', 'weekday_name'] + 'week', 'dayofyear', 'days_in_month', 'daysinmonth', 'dayofweek', + 'weekday_name'] for field in fields: prop = property(fget=lambda self: np.nan) setattr(NaTType, field, prop) -# GH9513 NaT methods (except to_datetime64) to raise, return np.nan, or return NaT -# create functions that raise, for binding to NaTType + +# GH9513 NaT methods (except to_datetime64) to raise, return np.nan, or +# return NaT create functions that raise, for binding to NaTType def _make_error_func(func_name): def f(*args, **kwargs): raise ValueError("NaTType does not support " + func_name) f.__name__ = func_name return f + def _make_nat_func(func_name): def f(*args, **kwargs): return NaT f.__name__ = func_name return f + def _make_nan_func(func_name): def f(*args, **kwargs): return np.nan f.__name__ = func_name return f -_nat_methods = ['date', 'now', 'replace', 'to_datetime', 'today'] +_nat_methods = ['date', 'now', 'replace', 'to_pydatetime', 'today'] _nan_methods = ['weekday', 'isoweekday', 'total_seconds'] -_implemented_methods = ['to_datetime64', 'isoformat'] +_implemented_methods = ['to_datetime', 'to_datetime64', 'isoformat'] _implemented_methods.extend(_nat_methods) _implemented_methods.extend(_nan_methods) @@ -789,7 +841,9 @@ for _maybe_method_name in dir(NaTType): if (callable(_maybe_method) and not _maybe_method_name.startswith("_") and _maybe_method_name not in _implemented_methods): - setattr(NaTType, _maybe_method_name, _make_error_func(_maybe_method_name)) + setattr(NaTType, _maybe_method_name, + _make_error_func(_maybe_method_name)) + def __nat_unpickle(*args): # return constant defined in the module @@ -823,22 +877,6 @@ cdef inline bint _cmp_nat_dt(_NaT lhs, _Timestamp rhs, int op) except -1: return _nat_scalar_rules[op] -cdef _tz_format(object obj, object zone): - try: - return obj.strftime(' %%Z, tz=%s' % zone) - except: - return ', tz=%s' % zone - -def is_timestamp_array(ndarray[object] values): - cdef int i, n = len(values) - if n == 0: - return False - for i in range(n): - if not is_timestamp(values[i]): - return False - return True - - cpdef object get_value_box(ndarray arr, object loc): cdef: Py_ssize_t i, sz @@ -911,16 +949,6 @@ cdef inline bint _is_multiple(int64_t us, int64_t mult): return us % mult == 0 -def apply_offset(ndarray[object] values, object offset): - cdef: - Py_ssize_t i, n = len(values) - ndarray[int64_t] new_values - object boxed - - result = np.empty(n, dtype='M8[ns]') - new_values = result.view('i8') - - cdef inline bint _cmp_scalar(int64_t lhs, int64_t rhs, int op) except -1: if op == Py_EQ: return lhs 
== rhs @@ -953,9 +981,10 @@ cdef str _NDIM_STRING = "ndim" # (see Timestamp class above). This will serve as a C extension type that # shadows the python class, where we do any heavy lifting. cdef class _Timestamp(datetime): + cdef readonly: int64_t value, nanosecond - object offset # frequency reference + object freq # frequency reference def __hash__(_Timestamp self): if self.nanosecond: @@ -973,7 +1002,7 @@ cdef class _Timestamp(datetime): ots = other elif isinstance(other, datetime): if self.nanosecond == 0: - val = self.to_datetime() + val = self.to_pydatetime() return PyObject_RichCompareBool(val, other, op) try: @@ -1029,13 +1058,15 @@ cdef class _Timestamp(datetime): pass tz = ", tz='{0}'".format(zone) if zone is not None else "" - offset = ", offset='{0}'".format(self.offset.freqstr) if self.offset is not None else "" + freq = ", freq='{0}'".format( + self.freq.freqstr) if self.freq is not None else "" - return "Timestamp('{stamp}'{tz}{offset})".format(stamp=stamp, tz=tz, offset=offset) + return "Timestamp('{stamp}'{tz}{freq})".format( + stamp=stamp, tz=tz, freq=freq) cdef bint _compare_outside_nanorange(_Timestamp self, datetime other, int op) except -1: - cdef datetime dtval = self.to_datetime() + cdef datetime dtval = self.to_pydatetime() self._assert_tzawareness_compat(other) @@ -1065,9 +1096,28 @@ cdef class _Timestamp(datetime): raise TypeError('Cannot compare tz-naive and tz-aware timestamps') cpdef datetime to_datetime(_Timestamp self): + """ + DEPRECATED: use :meth:`to_pydatetime` instead. + + Convert a Timestamp object to a native Python datetime object. + """ + warnings.warn("to_datetime is deprecated. Use self.to_pydatetime()", + FutureWarning, stacklevel=2) + return self.to_pydatetime(warn=False) + + cpdef datetime to_pydatetime(_Timestamp self, warn=True): + """ + Convert a Timestamp object to a native Python datetime object. + + If warn=True, issue a warning if nanoseconds is nonzero. 
+ """ cdef: pandas_datetimestruct dts _TSObject ts + + if self.nanosecond != 0 and warn: + warnings.warn("Discarding nonzero nanoseconds in conversion", + UserWarning, stacklevel=2) ts = convert_to_tsobject(self, self.tzinfo, None, 0, 0) dts = ts.dts return datetime(dts.year, dts.month, dts.day, @@ -1083,17 +1133,22 @@ cdef class _Timestamp(datetime): if is_timedelta64_object(other): other_int = other.astype('timedelta64[ns]').view('i8') - return Timestamp(self.value + other_int, tz=self.tzinfo, offset=self.offset) + return Timestamp(self.value + other_int, + tz=self.tzinfo, freq=self.freq) elif is_integer_object(other): - if self.offset is None: + if self is NaT: + # to be compat with Period + return NaT + elif self.freq is None: raise ValueError("Cannot add integral value to Timestamp " - "without offset.") - return Timestamp((self.offset * other).apply(self), offset=self.offset) + "without freq.") + return Timestamp((self.freq * other).apply(self), freq=self.freq) elif isinstance(other, timedelta) or hasattr(other, 'delta'): nanos = _delta_to_nanoseconds(other) - result = Timestamp(self.value + nanos, tz=self.tzinfo, offset=self.offset) + result = Timestamp(self.value + nanos, + tz=self.tzinfo, freq=self.freq) if getattr(other, 'normalize', False): result = Timestamp(normalize_date(result)) return result @@ -1127,21 +1182,27 @@ cdef class _Timestamp(datetime): return NaT # coerce if necessary if we are a Timestamp-like - if isinstance(self, datetime) and (isinstance(other, datetime) or is_datetime64_object(other)): + if (isinstance(self, datetime) + and (isinstance(other, datetime) + or is_datetime64_object(other))): self = Timestamp(self) other = Timestamp(other) # validate tz's if get_timezone(self.tzinfo) != get_timezone(other.tzinfo): - raise TypeError("Timestamp subtraction must have the same timezones or no timezones") + raise TypeError( + "Timestamp subtraction must have the " + "same timezones or no timezones") - # scalar Timestamp/datetime - Timestamp/datetime -> yields a Timedelta + # scalar Timestamp/datetime - Timestamp/datetime -> yields a + # Timedelta try: - return Timedelta(self.value-other.value) + return Timedelta(self.value -other.value) except (OverflowError, OutOfBoundsDatetime): pass - # scalar Timestamp/datetime - Timedelta -> yields a Timestamp (with same timezone if specified) + # scalar Timestamp/datetime - Timedelta -> yields a Timestamp (with + # same timezone if specified) return datetime.__sub__(self, other) cpdef _get_field(self, field): @@ -1149,9 +1210,12 @@ cdef class _Timestamp(datetime): return int(out[0]) cpdef _get_start_end_field(self, field): - month_kw = self.freq.kwds.get('startingMonth', self.freq.kwds.get('month', 12)) if self.freq else 12 + month_kw = self.freq.kwds.get( + 'startingMonth', self.freq.kwds.get( + 'month', 12)) if self.freq else 12 freqstr = self.freqstr if self.freq else None - out = get_start_end_field(np.array([self.value], dtype=np.int64), field, freqstr, month_kw) + out = get_start_end_field( + np.array([self.value], dtype=np.int64), field, freqstr, month_kw) return out[0] property _repr_base: @@ -1340,19 +1404,20 @@ cdef convert_to_tsobject(object ts, object tz, object unit, obj.value = NPY_NAT else: obj.value = _get_datetime64_nanos(ts) - pandas_datetime_to_datetimestruct(obj.value, PANDAS_FR_ns, &obj.dts) + pandas_datetime_to_datetimestruct( + obj.value, PANDAS_FR_ns, &obj.dts) elif is_integer_object(ts): if ts == NPY_NAT: obj.value = NPY_NAT else: - ts = ts * cast_from_unit(None,unit) + ts = ts * cast_from_unit(None, 
unit) obj.value = ts pandas_datetime_to_datetimestruct(ts, PANDAS_FR_ns, &obj.dts) elif util.is_float_object(ts): if ts != ts or ts == NPY_NAT: obj.value = NPY_NAT else: - ts = cast_from_unit(ts,unit) + ts = cast_from_unit(ts, unit) obj.value = ts pandas_datetime_to_datetimestruct(ts, PANDAS_FR_ns, &obj.dts) elif PyDateTime_Check(ts): @@ -1403,7 +1468,9 @@ cdef convert_to_tsobject(object ts, object tz, object unit, ts = datetime.combine(ts, datetime_time()) return convert_to_tsobject(ts, tz, None, 0, 0) elif getattr(ts, '_typ', None) == 'period': - raise ValueError("Cannot convert Period to Timestamp unambiguously. Use to_timestamp") + raise ValueError( + "Cannot convert Period to Timestamp " + "unambiguously. Use to_timestamp") else: raise TypeError('Cannot convert input to Timestamp') @@ -1444,7 +1511,8 @@ cpdef convert_str_to_tsobject(object ts, object tz, object unit, else: try: _string_to_dts(ts, &obj.dts, &out_local, &out_tzoffset) - obj.value = pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &obj.dts) + obj.value = pandas_datetimestruct_to_datetime( + PANDAS_FR_ns, &obj.dts) _check_dts_bounds(&obj.dts) if out_local == 1: obj.tzinfo = pytz.FixedOffset(out_tzoffset) @@ -1462,12 +1530,14 @@ cpdef convert_str_to_tsobject(object ts, object tz, object unit, ts = tz_convert_single(ts, tz, 'UTC') except ValueError: try: - ts = parse_datetime_string(ts, dayfirst=dayfirst, yearfirst=yearfirst) + ts = parse_datetime_string( + ts, dayfirst=dayfirst, yearfirst=yearfirst) except Exception: raise ValueError("could not convert string to Timestamp") return convert_to_tsobject(ts, tz, unit, dayfirst, yearfirst) + def _test_parse_iso8601(object ts): """ TESTING ONLY: Parse string into Timestamp using iso8601 parser. Used @@ -1500,8 +1570,12 @@ cdef inline void _localize_tso(_TSObject obj, object tz): dt = datetime(obj.dts.year, obj.dts.month, obj.dts.day, obj.dts.hour, obj.dts.min, obj.dts.sec, obj.dts.us, tz) delta = int(total_seconds(_get_utcoffset(tz, dt))) * 1000000000 - pandas_datetime_to_datetimestruct(obj.value + delta, - PANDAS_FR_ns, &obj.dts) + if obj.value != NPY_NAT: + pandas_datetime_to_datetimestruct(obj.value + delta, + PANDAS_FR_ns, &obj.dts) + else: + pandas_datetime_to_datetimestruct(obj.value, + PANDAS_FR_ns, &obj.dts) obj.tzinfo = tz else: # Adjust datetime64 timestamp, recompute datetimestruct @@ -1509,24 +1583,32 @@ cdef inline void _localize_tso(_TSObject obj, object tz): pos = trans.searchsorted(obj.value, side='right') - 1 - # static/pytz/dateutil specific code if _is_fixed_offset(tz): # statictzinfo - if len(deltas) > 0: + if len(deltas) > 0 and obj.value != NPY_NAT: pandas_datetime_to_datetimestruct(obj.value + deltas[0], PANDAS_FR_ns, &obj.dts) else: - pandas_datetime_to_datetimestruct(obj.value, PANDAS_FR_ns, &obj.dts) + pandas_datetime_to_datetimestruct( + obj.value, PANDAS_FR_ns, &obj.dts) obj.tzinfo = tz elif _treat_tz_as_pytz(tz): inf = tz._transition_info[pos] - pandas_datetime_to_datetimestruct(obj.value + deltas[pos], - PANDAS_FR_ns, &obj.dts) + if obj.value != NPY_NAT: + pandas_datetime_to_datetimestruct(obj.value + deltas[pos], + PANDAS_FR_ns, &obj.dts) + else: + pandas_datetime_to_datetimestruct(obj.value, + PANDAS_FR_ns, &obj.dts) obj.tzinfo = tz._tzinfos[inf] elif _treat_tz_as_dateutil(tz): - pandas_datetime_to_datetimestruct(obj.value + deltas[pos], - PANDAS_FR_ns, &obj.dts) + if obj.value != NPY_NAT: + pandas_datetime_to_datetimestruct(obj.value + deltas[pos], + PANDAS_FR_ns, &obj.dts) + else: + pandas_datetime_to_datetimestruct(obj.value, + PANDAS_FR_ns, 
&obj.dts) obj.tzinfo = tz else: obj.tzinfo = tz @@ -1558,21 +1640,29 @@ cdef inline bint _is_utc(object tz): cdef inline object _get_zone(object tz): """ We need to do several things here: - 1/ Distinguish between pytz and dateutil timezones - 2/ Not be over-specific (e.g. US/Eastern with/without DST is same *zone* but a different tz object) - 3/ Provide something to serialize when we're storing a datetime object in pytables. - - We return a string prefaced with dateutil if it's a dateutil tz, else just the tz name. It needs to be a - string so that we can serialize it with UJSON/pytables. maybe_get_tz (below) is the inverse of this process. + 1) Distinguish between pytz and dateutil timezones + 2) Not be over-specific (e.g. US/Eastern with/without DST is same *zone* + but a different tz object) + 3) Provide something to serialize when we're storing a datetime object + in pytables. + + We return a string prefaced with dateutil if it's a dateutil tz, else just + the tz name. It needs to be a string so that we can serialize it with + UJSON/pytables. maybe_get_tz (below) is the inverse of this process. """ if _is_utc(tz): return 'UTC' else: if _treat_tz_as_dateutil(tz): if '.tar.gz' in tz._filename: - raise ValueError('Bad tz filename. Dateutil on python 3 on windows has a bug which causes tzfile._filename to be the same for all ' - 'timezone files. Please construct dateutil timezones implicitly by passing a string like "dateutil/Europe/London" ' - 'when you construct your pandas objects instead of passing a timezone object. See https://github.com/pydata/pandas/pull/7362') + raise ValueError( + 'Bad tz filename. Dateutil on python 3 on windows has a ' + 'bug which causes tzfile._filename to be the same for all ' + 'timezone files. Please construct dateutil timezones ' + 'implicitly by passing a string like "dateutil/Europe' + '/London" when you construct your pandas objects instead ' + 'of passing a timezone object. See ' + 'https://github.com/pydata/pandas/pull/7362') return 'dateutil/' + tz._filename else: # tz is a pytz timezone or unknown. @@ -1587,11 +1677,13 @@ cdef inline object _get_zone(object tz): cpdef inline object maybe_get_tz(object tz): """ - (Maybe) Construct a timezone object from a string. If tz is a string, use it to construct a timezone object. - Otherwise, just return tz. + (Maybe) Construct a timezone object from a string. If tz is a string, use + it to construct a timezone object. Otherwise, just return tz. """ if isinstance(tz, string_types): - if tz.startswith('dateutil/'): + if tz == 'tzlocal()': + tz = _dateutil_tzlocal() + elif tz.startswith('dateutil/'): zone = tz[9:] tz = _dateutil_gettz(zone) # On Python 3 on Windows, the filename is not always set correctly. 
@@ -1604,7 +1696,6 @@ cpdef inline object maybe_get_tz(object tz): return tz - class OutOfBoundsDatetime(ValueError): pass @@ -1624,16 +1715,9 @@ cdef inline _check_dts_bounds(pandas_datetimestruct *dts): dts.day, dts.hour, dts.min, dts.sec) - raise OutOfBoundsDatetime('Out of bounds nanosecond timestamp: %s' % fmt) + raise OutOfBoundsDatetime( + 'Out of bounds nanosecond timestamp: %s' % fmt) -# elif isinstance(ts, _Timestamp): -# tmp = ts -# obj.value = (<_Timestamp> ts).value -# obj.dtval = -# elif isinstance(ts, object): -# # If all else fails -# obj.value = _dtlike_to_datetime64(ts, &obj.dts) -# obj.dtval = _dts_to_pydatetime(&obj.dts) def datetime_to_datetime64(ndarray[object] values): cdef: @@ -1662,7 +1746,8 @@ def datetime_to_datetime64(ndarray[object] values): _check_dts_bounds(&_ts.dts) else: if inferred_tz is not None: - raise ValueError('Cannot mix tz-aware with tz-naive values') + raise ValueError( + 'Cannot mix tz-aware with tz-naive values') iresult[i] = _pydatetime_to_dts(val, &dts) _check_dts_bounds(&dts) else: @@ -1671,9 +1756,9 @@ def datetime_to_datetime64(ndarray[object] values): return result, inferred_tz cdef: - set _not_datelike_strings = set(['a','A','m','M','p','P','t','T']) + set _not_datelike_strings = set(['a', 'A', 'm', 'M', 'p', 'P', 't', 'T']) -cpdef object _does_string_look_like_datetime(object date_string): +cpdef bint _does_string_look_like_datetime(object date_string): if date_string.startswith('0'): # Strings starting with 0 are more consistent with a # date-like string than a number @@ -1715,7 +1800,7 @@ def format_array_from_datetime(ndarray[int64_t] values, object tz=None, pandas_datetimestruct dts if na_rep is None: - na_rep = 'NaT' + na_rep = 'NaT' # if we don't have a format nor tz, then choose # a format based on precision @@ -1753,7 +1838,7 @@ def format_array_from_datetime(ndarray[int64_t] values, object tz=None, elif show_us: res += '.%.6d' % dts.us elif show_ms: - res += '.%.3d' % (dts.us/1000) + res += '.%.3d' % (dts.us /1000) result[i] = res @@ -1783,7 +1868,6 @@ cdef object _TIMEPAT = re.compile(r'^([01]?[0-9]|2[0-3]):([0-5][0-9])') def parse_datetime_string(object date_string, object freq=None, dayfirst=False, yearfirst=False, **kwargs): - """parse datetime string, only returns datetime. Also cares special handling matching time patterns. 
@@ -1811,8 +1895,14 @@ def parse_datetime_string(object date_string, object freq=None, except ValueError: pass - dt = parse_date(date_string, default=_DEFAULT_DATETIME, - dayfirst=dayfirst, yearfirst=yearfirst, **kwargs) + try: + dt = parse_date(date_string, default=_DEFAULT_DATETIME, + dayfirst=dayfirst, yearfirst=yearfirst, **kwargs) + except TypeError: + # following may be raised from dateutil + # TypeError: 'NoneType' object is not iterable + raise ValueError('Given date string not likely a datetime.') + return dt @@ -1880,23 +1970,27 @@ cdef inline object _parse_dateabbr_string(object date_string, object default, i = date_string.index('Q', 1, 6) if i == 1: quarter = int(date_string[0]) - if date_len == 4 or (date_len == 5 and date_string[i + 1] == '-'): + if date_len == 4 or (date_len == 5 + and date_string[i + 1] == '-'): # r'(\d)Q-?(\d\d)') year = 2000 + int(date_string[-2:]) - elif date_len == 6 or (date_len == 7 and date_string[i + 1] == '-'): + elif date_len == 6 or (date_len == 7 + and date_string[i + 1] == '-'): # r'(\d)Q-?(\d\d\d\d)') year = int(date_string[-4:]) else: raise ValueError elif i == 2 or i == 3: # r'(\d\d)-?Q(\d)' - if date_len == 4 or (date_len == 5 and date_string[i - 1] == '-'): + if date_len == 4 or (date_len == 5 + and date_string[i - 1] == '-'): quarter = int(date_string[-1]) year = 2000 + int(date_string[:2]) else: raise ValueError elif i == 4 or i == 5: - if date_len == 6 or (date_len == 7 and date_string[i - 1] == '-'): + if date_len == 6 or (date_len == 7 + and date_string[i - 1] == '-'): # r'(\d\d\d\d)-?Q(\d)' quarter = int(date_string[-1]) year = int(date_string[:4]) @@ -1904,7 +1998,8 @@ cdef inline object _parse_dateabbr_string(object date_string, object default, raise ValueError if not (1 <= quarter <= 4): - msg = 'Incorrect quarterly string is given, quarter must be between 1 and 4: {0}' + msg = ('Incorrect quarterly string is given, quarter must be ' + 'between 1 and 4: {0}') raise DateParseError(msg.format(date_string)) if freq is not None: @@ -1912,7 +2007,8 @@ cdef inline object _parse_dateabbr_string(object date_string, object default, try: mnum = _MONTH_NUMBERS[_get_rule_month(freq)] + 1 except (KeyError, ValueError): - msg = 'Unable to retrieve month information from given freq: {0}'.format(freq) + msg = ('Unable to retrieve month information from given ' + 'freq: {0}').format(freq) raise DateParseError(msg) month = (mnum + (quarter - 1) * 3) % 12 + 1 @@ -1929,7 +2025,8 @@ cdef inline object _parse_dateabbr_string(object date_string, object default, except ValueError: pass - if date_len == 6 and (freq == 'M' or getattr(freq, 'rule_code', None) == 'M'): + if date_len == 6 and (freq == 'M' or getattr( + freq, 'rule_code', None) == 'M'): year = int(date_string[:4]) month = int(date_string[4:6]) try: @@ -2015,7 +2112,8 @@ def dateutil_parse(object timestr, object default, ignoretz=False, # const for parsers -_DEFAULT_DATETIME = datetime(1, 1, 1).replace(hour=0, minute=0, second=0, microsecond=0) +_DEFAULT_DATETIME = datetime(1, 1, 1).replace( + hour=0, minute=0, second=0, microsecond=0) _MONTHS = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC'] _MONTH_NUMBERS = dict((k, i) for i, k in enumerate(_MONTHS)) @@ -2059,7 +2157,9 @@ cpdef array_with_unit_to_datetime(ndarray values, unit, errors='coerce'): int64_t m ndarray[float64_t] fvalues ndarray mask - bint is_ignore=errors=='ignore', is_coerce=errors=='coerce', is_raise=errors=='raise' + bint is_ignore = errors=='ignore' + bint is_coerce = errors=='coerce' + 
bint is_raise = errors=='raise' bint need_to_iterate=True ndarray[int64_t] iresult ndarray[object] oresult @@ -2079,7 +2179,7 @@ cpdef array_with_unit_to_datetime(ndarray values, unit, errors='coerce'): # if we have nulls that are not type-compat # then need to iterate try: - iresult = values.astype('i8') + iresult = values.astype('i8', casting='same_kind', copy=False) mask = iresult == iNaT iresult[mask] = 0 fvalues = iresult.astype('f8') * m @@ -2090,9 +2190,11 @@ cpdef array_with_unit_to_datetime(ndarray values, unit, errors='coerce'): # check the bounds if not need_to_iterate: - if (fvalues < _NS_LOWER_BOUND).any() or (fvalues > _NS_UPPER_BOUND).any(): - raise OutOfBoundsDatetime("cannot convert input with unit '{0}'".format(unit)) - result = (iresult*m).astype('M8[ns]') + if ((fvalues < _NS_LOWER_BOUND).any() + or (fvalues > _NS_UPPER_BOUND).any()): + raise OutOfBoundsDatetime( + "cannot convert input with unit '{0}'".format(unit)) + result = (iresult *m).astype('M8[ns]') iresult = result.view('i8') iresult[mask] = iNaT return result @@ -2116,10 +2218,9 @@ cpdef array_with_unit_to_datetime(ndarray values, unit, errors='coerce'): iresult[i] = cast_from_unit(val, unit) except OverflowError: if is_raise: - raise OutOfBoundsDatetime("cannot convert input {0}" - "with the unit '{1}'".format( - val, - unit)) + raise OutOfBoundsDatetime( + "cannot convert input {0} with the unit " + "'{1}'".format(val, unit)) elif is_ignore: raise AssertionError iresult[i] = NPY_NAT @@ -2133,19 +2234,17 @@ cpdef array_with_unit_to_datetime(ndarray values, unit, errors='coerce'): iresult[i] = cast_from_unit(float(val), unit) except ValueError: if is_raise: - raise ValueError("non convertible value {0}" - "with the unit '{1}'".format( - val, - unit)) + raise ValueError( + "non convertible value {0} with the unit " + "'{1}'".format(val, unit)) elif is_ignore: raise AssertionError iresult[i] = NPY_NAT except: if is_raise: - raise OutOfBoundsDatetime("cannot convert input {0}" - "with the unit '{1}'".format( - val, - unit)) + raise OutOfBoundsDatetime( + "cannot convert input {0} with the unit " + "'{1}'".format(val, unit)) elif is_ignore: raise AssertionError iresult[i] = NPY_NAT @@ -2198,7 +2297,7 @@ cpdef array_with_unit_to_datetime(ndarray values, unit, errors='coerce'): cpdef array_to_datetime(ndarray[object] values, errors='raise', - dayfirst=False, yearfirst=False, freq=None, + dayfirst=False, yearfirst=False, format=None, utc=None, require_iso8601=False): cdef: @@ -2207,8 +2306,13 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', ndarray[int64_t] iresult ndarray[object] oresult pandas_datetimestruct dts - bint utc_convert = bool(utc), seen_integer=0, seen_string=0, seen_datetime=0 - bint is_raise=errors=='raise', is_ignore=errors=='ignore', is_coerce=errors=='coerce' + bint utc_convert = bool(utc) + bint seen_integer = 0 + bint seen_string = 0 + bint seen_datetime = 0 + bint is_raise = errors=='raise' + bint is_ignore = errors=='ignore' + bint is_coerce = errors=='coerce' _TSObject _ts int out_local=0, out_tzoffset=0 @@ -2307,7 +2411,8 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', seen_string=1 _string_to_dts(val, &dts, &out_local, &out_tzoffset) - value = pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts) + value = pandas_datetimestruct_to_datetime( + PANDAS_FR_ns, &dts) if out_local == 1: tz = pytz.FixedOffset(out_tzoffset) value = tz_convert_single(value, tz, 'UTC') @@ -2320,14 +2425,15 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', 
iresult[i] = NPY_NAT continue elif is_raise: - raise ValueError("time data %r does match format specified" % - (val,)) + raise ValueError( + "time data %r doesn't match format " + "specified" % (val,)) else: return values try: py_dt = parse_datetime_string(val, dayfirst=dayfirst, - yearfirst=yearfirst, freq=freq) + yearfirst=yearfirst) except Exception: if is_coerce: iresult[i] = NPY_NAT @@ -2365,7 +2471,8 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', if is_integer_object(val) or is_float_object(val): result[i] = NPY_NAT elif is_raise: - raise ValueError("mixed datetimes and integers in passed array") + raise ValueError( + "mixed datetimes and integers in passed array") else: raise TypeError @@ -2380,10 +2487,10 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', # set as nan except if its a NaT if _checknull_with_nat(val): - if val.view('i8') == NPY_NAT: - oresult[i] = NaT - else: + if PyFloat_Check(val): oresult[i] = np.nan + else: + oresult[i] = NaT elif util.is_datetime64_object(val): if get_datetime64_value(val) == NPY_NAT: oresult[i] = NaT @@ -2407,7 +2514,7 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', try: oresult[i] = parse_datetime_string(val, dayfirst=dayfirst, - yearfirst=yearfirst, freq=freq) + yearfirst=yearfirst) _pydatetime_to_dts(oresult[i], &dts) _check_dts_bounds(&dts) except Exception: @@ -2422,34 +2529,11 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', return oresult -def parse_str_array_to_datetime(ndarray values, dayfirst=False, - yearfirst=False, object freq=None): - """Shortcut to parse str array for quicker DatetimeIndex construction""" - cdef: - Py_ssize_t i, n = len(values) - object val, py_dt - ndarray[int64_t] iresult - _TSObject _ts - - iresult = np.empty(n, dtype='i8') - - for i in range(n): - val = values[i] - try: - py_dt = parse_datetime_string(val, dayfirst=dayfirst, - yearfirst=yearfirst, freq=freq) - except Exception: - raise ValueError - _ts = convert_to_tsobject(py_dt, None, None, 0, 0) - iresult[i] = _ts.value - - return iresult - -# Similar to Timestamp/datetime, this is a construction requirement for timedeltas -# we need to do object instantiation in python -# This will serve as a C extension type that -# shadows the python class, where we do any heavy lifting. +# Similar to Timestamp/datetime, this is a construction requirement for +# timedeltas that we need to do object instantiation in python. This will +# serve as a C extension type that shadows the Python class, where we do any +# heavy lifting. 
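One behavioural fix in the array_to_datetime hunk above: when falling back to an object result, a float NaN input now stays np.nan while other null-likes (None, NaT) are boxed as NaT, instead of deciding based on the NaT bit pattern. A small sketch of the new rule, assuming the value has already been identified as null-like:

```python
import numpy as np
import pandas as pd

def box_null(val):
    # val is already known to be null-like (NaN, None or NaT); floats keep
    # their float missing-value marker, everything else becomes NaT.
    if isinstance(val, float):
        return np.nan
    return pd.NaT

print(box_null(np.nan), box_null(None), box_null(pd.NaT))   # nan NaT NaT
```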
cdef class _Timedelta(timedelta): cdef readonly: @@ -2509,21 +2593,20 @@ cdef class _Timedelta(timedelta): """ compute the components """ - cdef int64_t sfrac, ifrac, ivalue = self.value - cdef float64_t frac + cdef int64_t sfrac, ifrac, frac, ivalue = self.value if self.is_populated: return # put frac in seconds - frac = float(ivalue)/1e9 + frac = ivalue /(1000 *1000 *1000) if frac < 0: self._sign = -1 # even fraction - if int(-frac/86400) != -frac/86400.0: - self._d = int(-frac/86400.0+1) - frac += 86400*self._d + if (-frac % 86400) != 0: + self._d = -frac /86400 + 1 + frac += 86400 *self._d else: frac = -frac else: @@ -2531,40 +2614,39 @@ cdef class _Timedelta(timedelta): self._d = 0 if frac >= 86400: - self._d += int(frac / 86400) - frac -= self._d * 86400 + self._d += frac / 86400 + frac -= self._d * 86400 if frac >= 3600: - self._h = int(frac / 3600) - frac -= self._h * 3600 + self._h = frac / 3600 + frac -= self._h * 3600 else: self._h = 0 if frac >= 60: - self._m = int(frac / 60) - frac -= self._m * 60 + self._m = frac / 60 + frac -= self._m * 60 else: self._m = 0 if frac >= 0: - self._s = int(frac) - frac -= self._s + self._s = frac + frac -= self._s else: self._s = 0 - if frac != 0: - - # reset so we don't lose precision - sfrac = int((self._h*3600 + self._m*60 + self._s)*1e9) - if self._sign < 0: - ifrac = ivalue + self._d*DAY_NS - sfrac - else: - ifrac = ivalue - (self._d*DAY_NS + sfrac) + sfrac = (self._h * 3600 + self._m * 60 + + self._s) * (1000 * 1000 * 1000) + if self._sign < 0: + ifrac = ivalue + self._d *DAY_NS - sfrac + else: + ifrac = ivalue - (self._d *DAY_NS + sfrac) - self._ms = int(ifrac/1e6) - ifrac -= self._ms*1000*1000 - self._us = int(ifrac/1e3) - ifrac -= self._us*1000 + if ifrac != 0: + self._ms = ifrac /(1000 *1000) + ifrac -= self._ms *1000 *1000 + self._us = ifrac /1000 + ifrac -= self._us *1000 self._ns = ifrac else: self._ms = 0 @@ -2578,16 +2660,20 @@ cdef class _Timedelta(timedelta): return an actual datetime.timedelta object note: we lose nanosecond resolution if any """ - return timedelta(microseconds=int(self.value)/1000) + return timedelta(microseconds=int(self.value) /1000) cpdef bint _has_ns(self): return self.value % 1000 != 0 # components named tuple -Components = collections.namedtuple('Components',['days','hours','minutes','seconds','milliseconds','microseconds','nanoseconds']) +Components = collections.namedtuple('Components', [ + 'days', 'hours', 'minutes', 'seconds', + 'milliseconds', 'microseconds', 'nanoseconds']) # Python front end to C extension type _Timedelta # This serves as the box for timedelta64 + + class Timedelta(_Timedelta): """ Represents a duration, the difference between two dates or times. @@ -2600,7 +2686,8 @@ class Timedelta(_Timedelta): value : Timedelta, timedelta, np.timedelta64, string, or integer unit : string, [D,h,m,s,ms,us,ns] Denote the unit of the input, if input is an integer. Default 'ns'. - days, seconds, microseconds, milliseconds, minutes, hours, weeks : numeric, optional + days, seconds, microseconds, + milliseconds, minutes, hours, weeks : numeric, optional Values for construction in compat with datetime.timedelta. np ints and floats will be coereced to python ints and floats. 
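The _ensure_components rewrite above replaces float division with pure integer arithmetic when splitting the stored nanosecond count into days, hours, minutes, seconds and sub-second fields, which avoids precision loss for large values. A simplified equivalent using divmod (the Cython version carries the sign on the day component and fills the struct fields instead of returning a tuple):

```python
def decompose_ns(value):
    """Split a signed nanosecond count into timedelta components using only
    integer arithmetic, the same reason the hunk drops float division."""
    sign = -1 if value < 0 else 1
    v = abs(value)
    days, v = divmod(v, 86400 * 10**9)
    hours, v = divmod(v, 3600 * 10**9)
    minutes, v = divmod(v, 60 * 10**9)
    seconds, v = divmod(v, 10**9)
    ms, v = divmod(v, 10**6)
    us, ns = divmod(v, 10**3)
    return sign, days, hours, minutes, seconds, ms, us, ns

# 1 day, 2 hours and 3 nanoseconds:
print(decompose_ns(86400 * 10**9 + 2 * 3600 * 10**9 + 3))
# (1, 1, 2, 0, 0, 0, 0, 3)
```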
@@ -2610,48 +2697,57 @@ class Timedelta(_Timedelta): """ - def __new__(cls, object value=None, unit=None, **kwargs): + def __new__(cls, object value=_no_input, unit=None, **kwargs): cdef _Timedelta td_base - if value is None: + if value is _no_input: if not len(kwargs): - raise ValueError("cannot construct a TimeDelta without a value/unit or descriptive keywords (days,seconds....)") + raise ValueError( + "cannot construct a Timedelta without a value/unit or " + "descriptive keywords (days,seconds....)") def _to_py_int_float(v): if is_integer_object(v): return int(v) elif is_float_object(v): return float(v) - raise TypeError("Invalid type {0}. Must be int or float.".format(type(v))) + raise TypeError( + "Invalid type {0}. Must be int or float.".format(type(v))) - kwargs = dict([ (k, _to_py_int_float(v)) for k, v in iteritems(kwargs) ]) + kwargs = dict([ (k, _to_py_int_float(v)) + for k, v in iteritems(kwargs) ]) try: - nano = kwargs.pop('nanoseconds',0) - value = convert_to_timedelta64(timedelta(**kwargs),'ns',False) + nano + nano = kwargs.pop('nanoseconds', 0) + value = convert_to_timedelta64( + timedelta(**kwargs), 'ns') + nano except TypeError as e: - raise ValueError("cannot construct a TimeDelta from the passed arguments, allowed keywords are " - "[weeks, days, hours, minutes, seconds, milliseconds, microseconds, nanoseconds]") + raise ValueError("cannot construct a Timedelta from the " + "passed arguments, allowed keywords are " + "[weeks, days, hours, minutes, seconds, " + "milliseconds, microseconds, nanoseconds]") if isinstance(value, Timedelta): value = value.value elif util.is_string_object(value): - value = np.timedelta64(parse_timedelta_string(value, False)) + value = np.timedelta64(parse_timedelta_string(value)) elif isinstance(value, timedelta): - value = convert_to_timedelta64(value,'ns',False) + value = convert_to_timedelta64(value, 'ns') elif isinstance(value, np.timedelta64): if unit is not None: value = value.astype('timedelta64[{0}]'.format(unit)) value = value.astype('timedelta64[ns]') - elif hasattr(value,'delta'): - value = np.timedelta64(_delta_to_nanoseconds(value.delta),'ns') + elif hasattr(value, 'delta'): + value = np.timedelta64(_delta_to_nanoseconds(value.delta), 'ns') elif is_integer_object(value) or util.is_float_object(value): # unit=None is de-facto 'ns' - value = convert_to_timedelta64(value,unit,False) + value = convert_to_timedelta64(value, unit) elif _checknull_with_nat(value): return NaT else: - raise ValueError("Value must be Timedelta, string, integer, float, timedelta or convertible") + raise ValueError( + "Value must be Timedelta, string, integer, " + "float, timedelta or convertible") if isinstance(value, np.timedelta64): value = value.view('i8') @@ -2661,7 +2757,7 @@ class Timedelta(_Timedelta): return NaT # make timedelta happy - td_base = _Timedelta.__new__(cls, microseconds=int(value)/1000) + td_base = _Timedelta.__new__(cls, microseconds=int(value) /1000) td_base.value = value td_base.is_populated = 0 return td_base @@ -2682,19 +2778,19 @@ class Timedelta(_Timedelta): self._ensure_components() if self._ns: - return "N" + return "N" elif self._us: - return "U" + return "U" elif self._ms: - return "L" + return "L" elif self._s: - return "S" + return "S" elif self._m: - return "T" + return "T" elif self._h: - return "H" + return "H" else: - return "D" + return "D" def _round(self, freq, rounder): @@ -2702,8 +2798,8 @@ class Timedelta(_Timedelta): from pandas.tseries.frequencies import to_offset unit = to_offset(freq).nanos - result = 
unit*rounder(self.value/float(unit)) - return Timedelta(result,unit='ns') + result = unit *rounder(self.value /float(unit)) + return Timedelta(result, unit='ns') def round(self, freq): """ @@ -2760,43 +2856,49 @@ class Timedelta(_Timedelta): self._ensure_components() if self._sign < 0: - sign_pretty = "-" - sign2_pretty = " +" + sign_pretty = "-" + sign2_pretty = " +" else: - sign_pretty = "" - sign2_pretty = " " + sign_pretty = "" + sign2_pretty = " " # show everything if format == 'all': - seconds_pretty = "%02d.%03d%03d%03d" % (self._s, self._ms, self._us, self._ns) - return "%s%d days%s%02d:%02d:%s" % (sign_pretty, self._d, sign2_pretty, self._h, self._m, seconds_pretty) + seconds_pretty = "%02d.%03d%03d%03d" % ( + self._s, self._ms, self._us, self._ns) + return "%s%d days%s%02d:%02d:%s" % (sign_pretty, self._d, + sign2_pretty, self._h, + self._m, seconds_pretty) # by default not showing nano if self._ms or self._us or self._ns: - seconds_pretty = "%02d.%03d%03d" % (self._s, self._ms, self._us) + seconds_pretty = "%02d.%03d%03d" % (self._s, self._ms, self._us) else: - seconds_pretty = "%02d" % self._s + seconds_pretty = "%02d" % self._s # if we have a partial day - subs = self._h or self._m or self._s or self._ms or self._us or self._ns + subs = (self._h or self._m or self._s or + self._ms or self._us or self._ns) if format == 'even_day': - if not subs: - return "%s%d days" % (sign_pretty, self._d) + if not subs: + return "%s%d days" % (sign_pretty, self._d) elif format == 'sub_day': - if not self._d: + if not self._d: - # degenerate, don't need the extra space - if self._sign > 0: - sign2_pretty = "" - return "%s%s%02d:%02d:%s" % (sign_pretty, sign2_pretty, self._h, self._m, seconds_pretty) + # degenerate, don't need the extra space + if self._sign > 0: + sign2_pretty = "" + return "%s%s%02d:%02d:%s" % (sign_pretty, sign2_pretty, + self._h, self._m, seconds_pretty) if subs or format=='long': - return "%s%d days%s%02d:%02d:%s" % (sign_pretty, self._d, sign2_pretty, self._h, self._m, seconds_pretty) + return "%s%d days%s%02d:%02d:%s" % (sign_pretty, self._d, + sign2_pretty, self._h, + self._m, seconds_pretty) return "%s%d days" % (sign_pretty, self._d) - def __repr__(self): return "Timedelta('{0}')".format(self._repr_base(format='long')) def __str__(self): @@ -2807,10 +2909,12 @@ class Timedelta(_Timedelta): """ Return a Components NamedTuple-like """ self._ensure_components() if self._sign < 0: - return Components(-self._d,self._h,self._m,self._s,self._ms,self._us,self._ns) + return Components(-self._d, self._h, self._m, self._s, + self._ms, self._us, self._ns) # return the named tuple - return Components(self._d,self._h,self._m,self._s,self._ms,self._us,self._ns) + return Components(self._d, self._h, self._m, self._s, + self._ms, self._us, self._ns) @property def days(self): @@ -2821,7 +2925,7 @@ class Timedelta(_Timedelta): """ self._ensure_components() if self._sign < 0: - return -1*self._d + return -1 *self._d return self._d @property @@ -2832,7 +2936,7 @@ class Timedelta(_Timedelta): .components will return the shown components """ self._ensure_components() - return self._h*3600 + self._m*60 + self._s + return self._h *3600 + self._m *60 + self._s @property def microseconds(self): @@ -2842,7 +2946,7 @@ class Timedelta(_Timedelta): .components will return the shown components """ self._ensure_components() - return self._ms*1000 + self._us + return self._ms *1000 + self._us @property def nanoseconds(self): @@ -2858,7 +2962,7 @@ class Timedelta(_Timedelta): """ Total duration of 
timedelta in seconds (to ns precision) """ - return 1e-9*self.value + return 1e-9 *self.value def __setstate__(self, state): (value) = state @@ -2879,13 +2983,13 @@ class Timedelta(_Timedelta): def _validate_ops_compat(self, other): # return True if we are compat with operating if _checknull_with_nat(other): - return True + return True elif isinstance(other, (Timedelta, timedelta, np.timedelta64)): - return True + return True elif util.is_string_object(other): - return True - elif hasattr(other,'delta'): - return True + return True + elif hasattr(other, 'delta'): + return True return False # higher than np.ndarray and np.matrix @@ -2914,10 +3018,17 @@ class Timedelta(_Timedelta): if not self._validate_ops_compat(other): return NotImplemented - other = Timedelta(other) if other is NaT: return NaT + + try: + other = Timedelta(other) + except ValueError: + # failed to parse as timedelta + return NotImplemented + return Timedelta(op(self.value, other.value), unit='ns') + f.__name__ = name return f @@ -2937,9 +3048,9 @@ class Timedelta(_Timedelta): # only integers and floats allowed if not (is_integer_object(other) or is_float_object(other)): - return NotImplemented + return NotImplemented - return Timedelta(other*self.value, unit='ns') + return Timedelta(other *self.value, unit='ns') __rmul__ = __mul__ @@ -2950,7 +3061,7 @@ class Timedelta(_Timedelta): # integers or floats if is_integer_object(other) or is_float_object(other): - return Timedelta(self.value/other, unit='ns') + return Timedelta(self.value /other, unit='ns') if not self._validate_ops_compat(other): return NotImplemented @@ -2958,7 +3069,7 @@ class Timedelta(_Timedelta): other = Timedelta(other) if other is NaT: return np.nan - return self.value/float(other.value) + return self.value /float(other.value) def __rtruediv__(self, other): if hasattr(other, 'dtype'): @@ -2973,13 +3084,13 @@ class Timedelta(_Timedelta): return float(other.value) / self.value if not PY3: - __div__ = __truediv__ - __rdiv__ = __rtruediv__ + __div__ = __truediv__ + __rdiv__ = __rtruediv__ def _not_implemented(self, *args, **kwargs): return NotImplemented - __floordiv__ = _not_implemented + __floordiv__ = _not_implemented __rfloordiv__ = _not_implemented def _op_unary_method(func, name): @@ -2995,77 +3106,83 @@ class Timedelta(_Timedelta): __abs__ = _op_unary_method(lambda x: abs(x), '__abs__') # resolution in ns -Timedelta.min = Timedelta(np.iinfo(np.int64).min+1) +Timedelta.min = Timedelta(np.iinfo(np.int64).min +1) Timedelta.max = Timedelta(np.iinfo(np.int64).max) cdef PyTypeObject* td_type = Timedelta + cdef inline bint is_timedelta(object o): return Py_TYPE(o) == td_type # isinstance(o, Timedelta) -def array_to_timedelta64(ndarray[object] values, unit='ns', errors='raise'): - """ convert an ndarray to an array of ints that are timedeltas - force conversion if coerce = True, - else will raise if cannot convert """ + +cpdef array_to_timedelta64(ndarray[object] values, unit='ns', errors='raise'): + """ + Convert an ndarray to an array of timedeltas. If errors == 'coerce', + coerce non-convertible objects to NaT. Otherwise, raise. 
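The arithmetic-method factory above now coerces the other operand inside a try/except and returns NotImplemented when it cannot be parsed as a Timedelta, instead of letting the parse error escape; Python then falls back to the other operand's reflected method or raises a clean TypeError. A toy illustration of that protocol (the class is a stand-in, not pandas' Timedelta):

```python
class NsDelta:
    """Toy duration holding an integer nanosecond count."""

    def __init__(self, ns):
        if isinstance(ns, NsDelta):
            ns = ns.ns
        self.ns = int(ns)                    # may raise for e.g. 'garbage'

    def __add__(self, other):
        try:
            other = NsDelta(other)
        except (TypeError, ValueError):
            return NotImplemented            # let Python try the other side
        return NsDelta(self.ns + other.ns)

print((NsDelta(5) + 7).ns)                   # 12
print(NsDelta(5).__add__("garbage"))         # NotImplemented
```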
+ """ + cdef: Py_ssize_t i, n ndarray[int64_t] iresult - bint is_raise=errors=='raise', is_ignore=errors=='ignore', is_coerce=errors=='coerce' - assert is_raise or is_ignore or is_coerce + if errors not in ('ignore', 'raise', 'coerce'): + raise ValueError("errors must be one of 'ignore', " + "'raise', or 'coerce'}") n = values.shape[0] result = np.empty(n, dtype='m8[ns]') iresult = result.view('i8') - # usually we have all strings - # if so then we hit the fast path + # Usually, we have all strings. If so, we hit the fast path. + # If this path fails, we try conversion a different way, and + # this is where all of the error handling will take place. try: for i in range(n): - result[i] = parse_timedelta_string(values[i], is_coerce) + result[i] = parse_timedelta_string(values[i]) except: for i in range(n): - result[i] = convert_to_timedelta64(values[i], unit, is_coerce) - return iresult - + try: + result[i] = convert_to_timedelta64(values[i], unit) + except ValueError: + if errors == 'coerce': + result[i] = NPY_NAT + else: + raise -def convert_to_timedelta(object ts, object unit='ns', errors='raise'): - cdef bint is_raise=errors=='raise', is_ignore=errors=='ignore', is_coerce=errors=='coerce' + return iresult - assert is_raise or is_ignore or is_coerce - return convert_to_timedelta64(ts, unit, is_coerce) - -cdef dict timedelta_abbrevs = { 'D' : 'd', - 'd' : 'd', - 'days' : 'd', - 'day' : 'd', - 'hours' : 'h', - 'hour' : 'h', - 'hr' : 'h', - 'h' : 'h', - 'm' : 'm', - 'minute' : 'm', - 'min' : 'm', - 'minutes' : 'm', - 's' : 's', - 'seconds' : 's', - 'sec' : 's', - 'second' : 's', - 'ms' : 'ms', - 'milliseconds' : 'ms', - 'millisecond' : 'ms', - 'milli' : 'ms', - 'millis' : 'ms', - 'us' : 'us', - 'microseconds' : 'us', - 'microsecond' : 'us', - 'micro' : 'us', - 'micros' : 'us', - 'ns' : 'ns', - 'nanoseconds' : 'ns', - 'nano' : 'ns', - 'nanos' : 'ns', - 'nanosecond' : 'ns', +cdef dict timedelta_abbrevs = { 'D': 'd', + 'd': 'd', + 'days': 'd', + 'day': 'd', + 'hours': 'h', + 'hour': 'h', + 'hr': 'h', + 'h': 'h', + 'm': 'm', + 'minute': 'm', + 'min': 'm', + 'minutes': 'm', + 's': 's', + 'seconds': 's', + 'sec': 's', + 'second': 's', + 'ms': 'ms', + 'milliseconds': 'ms', + 'millisecond': 'ms', + 'milli': 'ms', + 'millis': 'ms', + 'us': 'us', + 'microseconds': 'us', + 'microsecond': 'us', + 'micro': 'us', + 'micros': 'us', + 'ns': 'ns', + 'nanoseconds': 'ns', + 'nano': 'ns', + 'nanos': 'ns', + 'nanosecond': 'ns', } timedelta_abbrevs_map = timedelta_abbrevs @@ -3101,15 +3218,10 @@ cdef inline timedelta_from_spec(object number, object frac, object unit): n = ''.join(number) + '.' + ''.join(frac) return cast_from_unit(float(n), unit) -cdef inline parse_timedelta_string(object ts, coerce=False): +cdef inline parse_timedelta_string(object ts): """ - Parse an regular format timedelta string - - Return an int64_t or raise a ValueError on an invalid parse - - if coerce, set a non-valid value to NaT - - Return a ns based int64 + Parse a regular format timedelta string. Return an int64_t (in ns) + or raise a ValueError on an invalid parse. 
""" cdef: @@ -3120,7 +3232,8 @@ cdef inline parse_timedelta_string(object ts, coerce=False): list number=[], frac=[], unit=[] # neg : tracks if we have a leading negative for the value - # have_dot : tracks if we are processing a dot (either post hhmmss or inside an expression) + # have_dot : tracks if we are processing a dot (either post hhmmss or + # inside an expression) # have_value : track if we have at least 1 leading unit # have_hhmmss : tracks if we have a regular format hh:mm:ss @@ -3165,13 +3278,7 @@ cdef inline parse_timedelta_string(object ts, coerce=False): number.append(c) else: - - try: - r = timedelta_from_spec(number, frac, unit) - except ValueError: - if coerce: - return NPY_NAT - raise + r = timedelta_from_spec(number, frac, unit) unit, number, frac = [], [c], [] result += timedelta_as_neg(r, neg) @@ -3198,9 +3305,9 @@ cdef inline parse_timedelta_string(object ts, coerce=False): result += timedelta_as_neg(r, neg) have_hhmmss = 1 else: - if coerce: - return NPY_NAT - raise ValueError("expecting hh:mm:ss format, received: {0}".format(ts)) + raise ValueError("expecting hh:mm:ss format, " + "received: {0}".format(ts)) + unit, number = [], [] # after the decimal point @@ -3230,29 +3337,23 @@ cdef inline parse_timedelta_string(object ts, coerce=False): # we had a dot, but we have a fractional # value since we have an unit if have_dot and len(unit): - try: - r = timedelta_from_spec(number, frac, unit) - result += timedelta_as_neg(r, neg) - except ValueError: - if coerce: - return NPY_NAT - raise + r = timedelta_from_spec(number, frac, unit) + result += timedelta_as_neg(r, neg) # we have a dot as part of a regular format # e.g. hh:mm:ss.fffffff elif have_dot: - if (len(number) or len(frac)) and not len(unit) and current_unit is None: - if coerce: - return NPY_NAT + if ((len(number) or len(frac)) and not len(unit) + and current_unit is None): raise ValueError("no units specified") if len(frac) > 0 and len(frac) <= 3: - m = 10**(3-len(frac)) * 1000L * 1000L + m = 10**(3 -len(frac)) * 1000L * 1000L elif len(frac) > 3 and len(frac) <= 6: - m = 10**(6-len(frac)) * 1000L + m = 10**(6 -len(frac)) * 1000L else: - m = 10**(9-len(frac)) + m = 10**(9 -len(frac)) r = int(''.join(frac)) * m result += timedelta_as_neg(r, neg) @@ -3268,38 +3369,24 @@ cdef inline parse_timedelta_string(object ts, coerce=False): # we have a last abbreviation elif len(unit): - if len(number): - try: - r = timedelta_from_spec(number, frac, unit) - result += timedelta_as_neg(r, neg) - except ValueError: - if coerce: - return NPY_NAT - raise + r = timedelta_from_spec(number, frac, unit) + result += timedelta_as_neg(r, neg) else: - if coerce: - return NPY_NAT raise ValueError("unit abbreviation w/o a number") # treat as nanoseconds # but only if we don't have anything else else: - if have_value: raise ValueError("have leftover units") if len(number): - try: - r = timedelta_from_spec(number, frac, 'ns') - result += timedelta_as_neg(r, neg) - except ValueError: - if coerce: - return NPY_NAT - raise + r = timedelta_from_spec(number, frac, 'ns') + result += timedelta_as_neg(r, neg) return result -cdef inline convert_to_timedelta64(object ts, object unit, object coerce): +cpdef convert_to_timedelta64(object ts, object unit): """ Convert an incoming object to a timedelta64 if possible @@ -3310,9 +3397,7 @@ cdef inline convert_to_timedelta64(object ts, object unit, object coerce): - np.int64 (with unit providing a possible modifier) - None/NaT - if coerce, set a non-valid value to NaT - - Return a ns based int64 + Return an 
ns based int64 # kludgy here until we have a timedelta scalar # handle the numpy < 1.7 case @@ -3334,7 +3419,7 @@ cdef inline convert_to_timedelta64(object ts, object unit, object coerce): else: if util.is_array(ts): ts = ts.astype('int64').item() - if unit in ['Y','M','W']: + if unit in ['Y', 'M', 'W']: ts = np.timedelta64(ts, unit) else: ts = cast_from_unit(ts, unit) @@ -3342,25 +3427,26 @@ cdef inline convert_to_timedelta64(object ts, object unit, object coerce): elif is_float_object(ts): if util.is_array(ts): ts = ts.astype('int64').item() - if unit in ['Y','M','W']: + if unit in ['Y', 'M', 'W']: ts = np.timedelta64(int(ts), unit) else: ts = cast_from_unit(ts, unit) ts = np.timedelta64(ts) elif util.is_string_object(ts): - ts = np.timedelta64(parse_timedelta_string(ts, coerce)) - elif hasattr(ts,'delta'): - ts = np.timedelta64(_delta_to_nanoseconds(ts),'ns') + ts = np.timedelta64(parse_timedelta_string(ts)) + elif hasattr(ts, 'delta'): + ts = np.timedelta64(_delta_to_nanoseconds(ts), 'ns') if isinstance(ts, timedelta): ts = np.timedelta64(ts) elif not isinstance(ts, np.timedelta64): - if coerce: - return np.timedelta64(NPY_NAT) - raise ValueError("Invalid type for timedelta scalar: %s" % type(ts)) + raise ValueError("Invalid type for timedelta " + "scalar: %s" % type(ts)) return ts.astype('timedelta64[ns]') -def array_strptime(ndarray[object] values, object fmt, bint exact=True, errors='raise'): + +def array_strptime(ndarray[object] values, object fmt, + bint exact=True, errors='raise'): """ Parameters ---------- @@ -3379,7 +3465,9 @@ def array_strptime(ndarray[object] values, object fmt, bint exact=True, errors=' int64_t us, ns object val, group_key, ampm, found dict found_key - bint is_raise=errors=='raise', is_ignore=errors=='ignore', is_coerce=errors=='coerce' + bint is_raise = errors=='raise' + bint is_ignore = errors=='ignore' + bint is_coerce = errors=='coerce' assert is_raise or is_ignore or is_coerce @@ -3457,8 +3545,8 @@ def array_strptime(ndarray[object] values, object fmt, bint exact=True, errors=' if is_coerce: iresult[i] = NPY_NAT continue - raise ValueError("time data %r does not match format %r (match)" % - (values[i], fmt)) + raise ValueError("time data %r does not match " + "format %r (match)" % (values[i], fmt)) if len(val) != found.end(): if is_coerce: iresult[i] = NPY_NAT @@ -3473,8 +3561,8 @@ def array_strptime(ndarray[object] values, object fmt, bint exact=True, errors=' if is_coerce: iresult[i] = NPY_NAT continue - raise ValueError("time data %r does not match format %r (search)" % - (values[i], fmt)) + raise ValueError("time data %r does not match format " + "%r (search)" % (values[i], fmt)) year = 1900 month = day = 1 @@ -3578,7 +3666,8 @@ def array_strptime(ndarray[object] values, object fmt, bint exact=True, errors=' # same and yet time.daylight is true; too ambiguous to # be able to tell what timezone has daylight savings if (time.tzname[0] == time.tzname[1] and - time.daylight and found_zone not in ("utc", "gmt")): + time.daylight and found_zone not in ( + "utc", "gmt")): break else: tz = value @@ -3594,9 +3683,10 @@ def array_strptime(ndarray[object] values, object fmt, bint exact=True, errors=' # calculation. try: if julian == -1: - # Need to add 1 to result since first day of the year is 1, not 0. + # Need to add 1 to result since first day of the year is 1, not + # 0. 
julian = datetime_date(year, month, day).toordinal() - \ - datetime_date(year, 1, 1).toordinal() + 1 + datetime_date(year, 1, 1).toordinal() + 1 else: # Assume that if they bothered to include Julian day it will # be accurate. datetime_result = datetime_date.fromordinal( @@ -3605,10 +3695,10 @@ def array_strptime(ndarray[object] values, object fmt, bint exact=True, errors=' month = datetime_result.month day = datetime_result.day except ValueError: - if is_coerce: - iresult[i] = NPY_NAT - continue - raise + if is_coerce: + iresult[i] = NPY_NAT + continue + raise if weekday == -1: weekday = datetime_date(year, month, day).weekday() @@ -3687,10 +3777,11 @@ cpdef inline int64_t cast_from_unit(object ts, object unit) except? -1: # cast the unit, multiply base/frace separately # to avoid precision issues from float -> int base = ts - frac = ts-base + frac = ts -base if p: - frac = round(frac,p) - return (base*m) + (frac*m) + frac = round(frac, p) + return (base *m) + (frac *m) + def cast_to_nanoseconds(ndarray arr): cdef: @@ -3736,6 +3827,7 @@ def pydt_to_i8(object pydt): return ts.value + def i8_to_pydt(int64_t i8, object tzinfo = None): """ Inverse of pydt_to_i8 @@ -3752,13 +3844,14 @@ try: except: have_pytz = False + def tz_convert(ndarray[int64_t] vals, object tz1, object tz2): cdef: - ndarray[int64_t] utc_dates, tt, result, trans, deltas, posn + ndarray[int64_t] utc_dates, tt, result, trans, deltas Py_ssize_t i, j, pos, n = len(vals) - int64_t v, offset + ndarray[Py_ssize_t] posn + int64_t v, offset, delta pandas_datetimestruct dts - Py_ssize_t trans_len if not have_pytz: import pytz @@ -3767,7 +3860,6 @@ def tz_convert(ndarray[int64_t] vals, object tz1, object tz2): return np.array([], dtype=np.int64) # Convert to UTC - if _get_zone(tz1) != 'UTC': utc_dates = np.empty(n, dtype=np.int64) if _is_tzlocal(tz1): @@ -3790,7 +3882,6 @@ def tz_convert(ndarray[int64_t] vals, object tz1, object tz2): if not len(tt): return vals - trans_len = len(trans) posn = trans.searchsorted(tt, side='right') j = 0 for i in range(n): @@ -3820,24 +3911,26 @@ def tz_convert(ndarray[int64_t] vals, object tz1, object tz2): pandas_datetime_to_datetimestruct(v, PANDAS_FR_ns, &dts) dt = datetime(dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz2) - delta = int(total_seconds(_get_utcoffset(tz2, dt))) * 1000000000 + delta = int(total_seconds( + _get_utcoffset(tz2, dt))) * 1000000000 result[i] = v + delta - return result + return result # Convert UTC to other timezone trans, deltas, typ = _get_dst_info(tz2) - trans_len = len(trans) - - # if all NaT, return all NaT - if (utc_dates==NPY_NAT).all(): - return utc_dates # use first non-NaT element # if all-NaT, return all-NaT if (result==NPY_NAT).all(): return result - posn = trans.searchsorted(utc_dates[utc_dates!=NPY_NAT], side='right') + # if all NaT, return all NaT + tt = utc_dates[utc_dates!=NPY_NAT] + if not len(tt): + return utc_dates + + posn = trans.searchsorted(tt, side='right') + j = 0 for i in range(n): v = utc_dates[i] @@ -3852,6 +3945,7 @@ def tz_convert(ndarray[int64_t] vals, object tz1, object tz2): result[i] = v + offset return result + def tz_convert_single(int64_t val, object tz1, object tz2): cdef: ndarray[int64_t] trans, deltas @@ -3905,7 +3999,8 @@ def tz_convert_single(int64_t val, object tz1, object tz2): dst_cache = {} cdef inline bint _treat_tz_as_pytz(object tz): - return hasattr(tz, '_utc_transition_times') and hasattr(tz, '_transition_info') + return hasattr(tz, '_utc_transition_times') and hasattr( + tz, '_transition_info') cdef 
inline bint _treat_tz_as_dateutil(object tz): return hasattr(tz, '_trans_list') and hasattr(tz, '_trans_idx') @@ -3918,24 +4013,32 @@ def _p_tz_cache_key(tz): cdef inline object _tz_cache_key(object tz): """ - Return the key in the cache for the timezone info object or None if unknown. + Return the key in the cache for the timezone info object or None + if unknown. - The key is currently the tz string for pytz timezones, the filename for dateutil timezones. + The key is currently the tz string for pytz timezones, the filename for + dateutil timezones. Notes ===== - This cannot just be the hash of a timezone object. Unfortunately, the hashes of two dateutil tz objects - which represent the same timezone are not equal (even though the tz objects will compare equal and - represent the same tz file). - Also, pytz objects are not always hashable so we use str(tz) instead. + This cannot just be the hash of a timezone object. Unfortunately, the + hashes of two dateutil tz objects which represent the same timezone are + not equal (even though the tz objects will compare equal and represent + the same tz file). Also, pytz objects are not always hashable so we use + str(tz) instead. """ if isinstance(tz, _pytz_BaseTzInfo): return tz.zone elif isinstance(tz, _dateutil_tzfile): if '.tar.gz' in tz._filename: - raise ValueError('Bad tz filename. Dateutil on python 3 on windows has a bug which causes tzfile._filename to be the same for all ' - 'timezone files. Please construct dateutil timezones implicitly by passing a string like "dateutil/Europe/London" ' - 'when you construct your pandas objects instead of passing a timezone object. See https://github.com/pydata/pandas/pull/7362') + raise ValueError('Bad tz filename. Dateutil on python 3 on ' + 'windows has a bug which causes tzfile._filename ' + 'to be the same for all timezone files. Please ' + 'construct dateutil timezones implicitly by ' + 'passing a string like "dateutil/Europe/London" ' + 'when you construct your pandas objects instead ' + 'of passing a timezone object. See ' + 'https://github.com/pydata/pandas/pull/7362') return 'dateutil' + tz._filename else: return None @@ -3972,26 +4075,29 @@ cdef object _get_dst_info(object tz): if len(tz._trans_list): # get utc trans times trans_list = _get_utc_trans_times_from_dateutil_tz(tz) - trans = np.hstack([np.array([0], dtype='M8[s]'), # place holder for first item - np.array(trans_list, dtype='M8[s]')]).astype('M8[ns]') # all trans listed + trans = np.hstack([ + np.array([0], dtype='M8[s]'), # place holder for first item + np.array(trans_list, dtype='M8[s]')]).astype( + 'M8[ns]') # all trans listed trans = trans.view('i8') trans[0] = NPY_NAT + 1 # deltas - deltas = np.array([v.offset for v in (tz._ttinfo_before,) + tz._trans_idx], dtype='i8') # + (tz._ttinfo_std,) + deltas = np.array([v.offset for v in ( + tz._ttinfo_before,) + tz._trans_idx], dtype='i8') deltas *= 1000000000 typ = 'dateutil' elif _is_fixed_offset(tz): trans = np.array([NPY_NAT + 1], dtype=np.int64) - deltas = np.array([tz._ttinfo_std.offset], dtype='i8') * 1000000000 + deltas = np.array([tz._ttinfo_std.offset], + dtype='i8') * 1000000000 typ = 'fixed' else: trans = np.array([], dtype='M8[ns]') deltas = np.array([], dtype='i8') typ = None - else: # static tzinfo trans = np.array([NPY_NAT + 1], dtype=np.int64) @@ -4005,8 +4111,9 @@ cdef object _get_dst_info(object tz): cdef object _get_utc_trans_times_from_dateutil_tz(object tz): """ - Transition times in dateutil timezones are stored in local non-dst time. 
This code - converts them to UTC. It's the reverse of the code in dateutil.tz.tzfile.__init__. + Transition times in dateutil timezones are stored in local non-dst + time. This code converts them to UTC. It's the reverse of the code + in dateutil.tz.tzfile.__init__. """ new_trans = list(tz._trans_list) last_std_offset = 0 @@ -4016,6 +4123,7 @@ cdef object _get_utc_trans_times_from_dateutil_tz(object tz): new_trans[i] = trans - last_std_offset return new_trans + def tot_seconds(td): return total_seconds(td) @@ -4085,7 +4193,8 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, elif hasattr(ambiguous, '__iter__'): is_dst = True if len(ambiguous) != len(vals): - raise ValueError("Length of ambiguous bool-array must be the same size as vals") + raise ValueError( + "Length of ambiguous bool-array must be the same size as vals") trans, deltas, typ = _get_dst_info(tz) @@ -4098,7 +4207,8 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, result_b.fill(NPY_NAT) # left side - idx_shifted = (np.maximum(0, trans.searchsorted(vals - DAY_NS, side='right') - 1)).astype(np.int64) + idx_shifted = (np.maximum(0, trans.searchsorted( + vals - DAY_NS, side='right') - 1)).astype(np.int64) for i in range(n): v = vals[i] - deltas[idx_shifted[i]] @@ -4109,7 +4219,8 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, result_a[i] = v # right side - idx_shifted = (np.maximum(0, trans.searchsorted(vals + DAY_NS, side='right') - 1)).astype(np.int64) + idx_shifted = (np.maximum(0, trans.searchsorted( + vals + DAY_NS, side='right') - 1)).astype(np.int64) for i in range(n): v = vals[i] - deltas[idx_shifted[i]] @@ -4126,36 +4237,39 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, # Get the ambiguous hours (given the above, these are the hours # where result_a != result_b and neither of them are NAT) both_nat = np.logical_and(result_a != NPY_NAT, result_b != NPY_NAT) - both_eq = result_a == result_b + both_eq = result_a == result_b trans_idx = np.squeeze(np.nonzero(np.logical_and(both_nat, ~both_eq))) if trans_idx.size == 1: stamp = Timestamp(vals[trans_idx]) - raise pytz.AmbiguousTimeError("Cannot infer dst time from %s as" - "there are no repeated times" % stamp) + raise pytz.AmbiguousTimeError( + "Cannot infer dst time from %s as there " + "are no repeated times" % stamp) # Split the array into contiguous chunks (where the difference between - # indices is 1). These are effectively dst transitions in different years - # which is useful for checking that there is not an ambiguous transition - # in an individual year. + # indices is 1). These are effectively dst transitions in different + # years which is useful for checking that there is not an ambiguous + # transition in an individual year. 
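The comment this hunk reflows describes the next step: the indices of ambiguous timestamps are split into runs of consecutive positions, one run per DST transition, so each year's transition can be checked on its own. The grouping itself is just np.diff plus np.array_split, for example:

```python
import numpy as np

trans_idx = np.array([10, 11, 12, 40, 41])         # hypothetical ambiguous positions
one_diff = np.where(np.diff(trans_idx) != 1)[0] + 1
trans_grp = np.array_split(trans_idx, one_diff)
print([g.tolist() for g in trans_grp])             # [[10, 11, 12], [40, 41]]
```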
if trans_idx.size > 0: - one_diff = np.where(np.diff(trans_idx)!=1)[0]+1 + one_diff = np.where(np.diff(trans_idx) != 1)[0] +1 trans_grp = np.array_split(trans_idx, one_diff) - # Iterate through each day, if there are no hours where the delta is negative - # (indicates a repeat of hour) the switch cannot be inferred + # Iterate through each day, if there are no hours where the + # delta is negative (indicates a repeat of hour) the switch + # cannot be inferred for grp in trans_grp: delta = np.diff(result_a[grp]) - if grp.size == 1 or np.all(delta>0): + if grp.size == 1 or np.all(delta > 0): stamp = Timestamp(vals[grp[0]]) raise pytz.AmbiguousTimeError(stamp) - # Find the index for the switch and pull from a for dst and b for standard - switch_idx = (delta<=0).nonzero()[0] + # Find the index for the switch and pull from a for dst and b + # for standard + switch_idx = (delta <= 0).nonzero()[0] if switch_idx.size > 1: - raise pytz.AmbiguousTimeError("There are %i dst switches " - "when there should only be 1." - % switch_idx.size) - switch_idx = switch_idx[0]+1 # Pull the only index and adjust + raise pytz.AmbiguousTimeError( + "There are %i dst switches when " + "there should only be 1." % switch_idx.size) + switch_idx = switch_idx[0] + 1 # Pull the only index and adjust a_idx = grp[:switch_idx] b_idx = grp[switch_idx:] dst_hours[grp] = np.hstack((result_a[a_idx], result_b[b_idx])) @@ -4180,9 +4294,9 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, result[i] = NPY_NAT else: stamp = Timestamp(vals[i]) - raise pytz.AmbiguousTimeError("Cannot infer dst time from %r, "\ - "try using the 'ambiguous' argument" - % stamp) + raise pytz.AmbiguousTimeError( + "Cannot infer dst time from %r, try using the " + "'ambiguous' argument" % stamp) elif left != NPY_NAT: result[i] = left elif right != NPY_NAT: @@ -4262,6 +4376,7 @@ def build_field_sarray(ndarray[int64_t] dtindex): return out + def get_time_micros(ndarray[int64_t] dtindex): """ Datetime as int64 representation to a structured array of fields @@ -4300,7 +4415,7 @@ def get_date_field(ndarray[int64_t] dtindex, object field): _month_offset = np.array( [[ 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365 ], [ 0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366 ]], - dtype=np.int32 ) + dtype=np.int32 ) count = len(dtindex) out = np.empty(count, dtype='i4') @@ -4310,7 +4425,8 @@ def get_date_field(ndarray[int64_t] dtindex, object field): for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) out[i] = dts.year return out @@ -4319,7 +4435,8 @@ def get_date_field(ndarray[int64_t] dtindex, object field): for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) out[i] = dts.month return out @@ -4328,7 +4445,8 @@ def get_date_field(ndarray[int64_t] dtindex, object field): for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) out[i] = dts.day return out @@ -4337,7 +4455,8 @@ def get_date_field(ndarray[int64_t] dtindex, object field): for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], 
PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) out[i] = dts.hour return out @@ -4346,7 +4465,8 @@ def get_date_field(ndarray[int64_t] dtindex, object field): for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) out[i] = dts.min return out @@ -4355,7 +4475,8 @@ def get_date_field(ndarray[int64_t] dtindex, object field): for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) out[i] = dts.sec return out @@ -4364,7 +4485,8 @@ def get_date_field(ndarray[int64_t] dtindex, object field): for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) out[i] = dts.us return out @@ -4373,7 +4495,8 @@ def get_date_field(ndarray[int64_t] dtindex, object field): for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) out[i] = dts.ps / 1000 return out elif field == 'doy': @@ -4381,9 +4504,10 @@ def get_date_field(ndarray[int64_t] dtindex, object field): for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) isleap = is_leapyear(dts.year) - out[i] = _month_offset[isleap, dts.month-1] + dts.day + out[i] = _month_offset[isleap, dts.month -1] + dts.day return out elif field == 'dow': @@ -4391,7 +4515,8 @@ def get_date_field(ndarray[int64_t] dtindex, object field): for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) out[i] = dayofweek(dts.year, dts.month, dts.day) return out @@ -4400,7 +4525,8 @@ def get_date_field(ndarray[int64_t] dtindex, object field): for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) isleap = is_leapyear(dts.year) isleap_prev = is_leapyear(dts.year - 1) mo_off = _month_offset[isleap, dts.month - 1] @@ -4430,7 +4556,8 @@ def get_date_field(ndarray[int64_t] dtindex, object field): for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) out[i] = dts.month out[i] = ((out[i] - 1) / 3) + 1 return out @@ -4440,15 +4567,19 @@ def get_date_field(ndarray[int64_t] dtindex, object field): for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) out[i] = days_in_month(dts) return out + elif field == 'is_leap_year': + return _isleapyear_arr(get_date_field(dtindex, 'Y')) raise ValueError("Field %s not supported" % field) @cython.wraparound(False) -def get_start_end_field(ndarray[int64_t] 
dtindex, object field, object freqstr=None, int month_kw=12): +def get_start_end_field(ndarray[int64_t] dtindex, object field, + object freqstr=None, int month_kw=12): """ Given an int64-based datetime index return array of indicators of whether timestamps are at the start/end of the month/quarter/year @@ -4470,21 +4601,24 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N _month_offset = np.array( [[ 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365 ], [ 0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366 ]], - dtype=np.int32 ) + dtype=np.int32 ) count = len(dtindex) out = np.zeros(count, dtype='int8') if freqstr: if freqstr == 'C': - raise ValueError("Custom business days is not supported by %s" % field) + raise ValueError( + "Custom business days is not supported by %s" % field) is_business = freqstr[0] == 'B' - # YearBegin(), BYearBegin() use month = starting month of year - # QuarterBegin(), BQuarterBegin() use startingMonth = starting month of year - # other offests use month, startingMonth as ending month of year. + # YearBegin(), BYearBegin() use month = starting month of year. + # QuarterBegin(), BQuarterBegin() use startingMonth = starting + # month of year. Other offests use month, startingMonth as ending + # month of year. - if (freqstr[0:2] in ['MS', 'QS', 'AS']) or (freqstr[1:3] in ['MS', 'QS', 'AS']): + if (freqstr[0:2] in ['MS', 'QS', 'AS']) or ( + freqstr[1:3] in ['MS', 'QS', 'AS']): end_month = 12 if month_kw == 1 else month_kw - 1 start_month = month_kw else: @@ -4499,7 +4633,8 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) ts = convert_to_tsobject(dtindex[i], None, None, 0, 0) dom = dts.day dow = ts_dayofweek(ts) @@ -4511,7 +4646,8 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) dom = dts.day if dom == 1: @@ -4523,7 +4659,8 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) ts = convert_to_tsobject(dtindex[i], None, None, 0, 0) isleap = is_leapyear(dts.year) mo_off = _month_offset[isleap, dts.month - 1] @@ -4532,14 +4669,16 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N ldom = _month_offset[isleap, dts.month] dow = ts_dayofweek(ts) - if (ldom == doy and dow < 5) or (dow == 4 and (ldom - doy <= 2)): + if (ldom == doy and dow < 5) or ( + dow == 4 and (ldom - doy <= 2)): out[i] = 1 return out.view(bool) else: for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) isleap = is_leapyear(dts.year) mo_off = _month_offset[isleap, dts.month - 1] dom = dts.day @@ -4555,19 +4694,22 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N for i in range(count): if dtindex[i] == NPY_NAT: out[i] = 
-1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) ts = convert_to_tsobject(dtindex[i], None, None, 0, 0) dom = dts.day dow = ts_dayofweek(ts) - if ((dts.month - start_month) % 3 == 0) and ((dom == 1 and dow < 5) or (dom <= 3 and dow == 0)): + if ((dts.month - start_month) % 3 == 0) and ( + (dom == 1 and dow < 5) or (dom <= 3 and dow == 0)): out[i] = 1 return out.view(bool) else: for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) dom = dts.day if ((dts.month - start_month) % 3 == 0) and dom == 1: @@ -4579,7 +4721,8 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) ts = convert_to_tsobject(dtindex[i], None, None, 0, 0) isleap = is_leapyear(dts.year) mo_off = _month_offset[isleap, dts.month - 1] @@ -4588,14 +4731,17 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N ldom = _month_offset[isleap, dts.month] dow = ts_dayofweek(ts) - if ((dts.month - end_month) % 3 == 0) and ((ldom == doy and dow < 5) or (dow == 4 and (ldom - doy <= 2))): + if ((dts.month - end_month) % 3 == 0) and ( + (ldom == doy and dow < 5) or ( + dow == 4 and (ldom - doy <= 2))): out[i] = 1 return out.view(bool) else: for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) isleap = is_leapyear(dts.year) mo_off = _month_offset[isleap, dts.month - 1] dom = dts.day @@ -4611,19 +4757,22 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) ts = convert_to_tsobject(dtindex[i], None, None, 0, 0) dom = dts.day dow = ts_dayofweek(ts) - if (dts.month == start_month) and ((dom == 1 and dow < 5) or (dom <= 3 and dow == 0)): + if (dts.month == start_month) and ( + (dom == 1 and dow < 5) or (dom <= 3 and dow == 0)): out[i] = 1 return out.view(bool) else: for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) dom = dts.day if (dts.month == start_month) and dom == 1: @@ -4635,7 +4784,8 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) ts = convert_to_tsobject(dtindex[i], None, None, 0, 0) isleap = is_leapyear(dts.year) dom = dts.day @@ -4644,14 +4794,17 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N dow = ts_dayofweek(ts) ldom = _month_offset[isleap, dts.month] - if (dts.month == end_month) and ((ldom == doy and dow < 5) or (dow == 4 and (ldom - doy <= 2))): + if (dts.month == end_month) and ( + 
(ldom == doy and dow < 5) or ( + dow == 4 and (ldom - doy <= 2))): out[i] = 1 return out.view(bool) else: for i in range(count): if dtindex[i] == NPY_NAT: out[i] = -1; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) ts = convert_to_tsobject(dtindex[i], None, None, 0, 0) isleap = is_leapyear(dts.year) mo_off = _month_offset[isleap, dts.month - 1] @@ -4665,6 +4818,7 @@ def get_start_end_field(ndarray[int64_t] dtindex, object field, object freqstr=N raise ValueError("Field %s not supported" % field) + @cython.wraparound(False) @cython.boundscheck(False) def get_date_name_field(ndarray[int64_t] dtindex, object field): @@ -4680,8 +4834,9 @@ def get_date_name_field(ndarray[int64_t] dtindex, object field): int dow _dayname = np.array( - ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'], - dtype=np.object_ ) + ['Monday', 'Tuesday', 'Wednesday', 'Thursday', + 'Friday', 'Saturday', 'Sunday'], + dtype=np.object_ ) count = len(dtindex) out = np.empty(count, dtype=object) @@ -4724,11 +4879,13 @@ def date_normalize(ndarray[int64_t] stamps, tz=None): if stamps[i] == NPY_NAT: result[i] = NPY_NAT continue - pandas_datetime_to_datetimestruct(stamps[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + stamps[i], PANDAS_FR_ns, &dts) result[i] = _normalized_stamp(&dts) return result + @cython.wraparound(False) @cython.boundscheck(False) cdef _normalize_local(ndarray[int64_t] stamps, object tz): @@ -4744,15 +4901,15 @@ cdef _normalize_local(ndarray[int64_t] stamps, object tz): if stamps[i] == NPY_NAT: result[i] = NPY_NAT continue - pandas_datetime_to_datetimestruct(stamps[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + stamps[i], PANDAS_FR_ns, &dts) result[i] = _normalized_stamp(&dts) elif _is_tzlocal(tz): for i in range(n): if stamps[i] == NPY_NAT: result[i] = NPY_NAT continue - pandas_datetime_to_datetimestruct(stamps[i], PANDAS_FR_ns, - &dts) + pandas_datetime_to_datetimestruct(stamps[i], PANDAS_FR_ns, &dts) dt = datetime(dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz) delta = int(total_seconds(_get_utcoffset(tz, dt))) * 1000000000 @@ -4769,7 +4926,7 @@ cdef _normalize_local(ndarray[int64_t] stamps, object tz): pos = _pos # statictzinfo - if typ not in ['pytz','dateutil']: + if typ not in ['pytz', 'dateutil']: for i in range(n): if stamps[i] == NPY_NAT: result[i] = NPY_NAT @@ -4810,12 +4967,10 @@ def dates_normalized(ndarray[int64_t] stamps, tz=None): elif _is_tzlocal(tz): for i in range(n): pandas_datetime_to_datetimestruct(stamps[i], PANDAS_FR_ns, &dts) - if (dts.min + dts.sec + dts.us) > 0: - return False dt = datetime(dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz) dt = dt + tz.utcoffset(dt) - if dt.hour > 0: + if (dt.hour + dt.minute + dt.second + dt.microsecond) > 0: return False else: trans, deltas, typ = _get_dst_info(tz) @@ -4835,8 +4990,18 @@ def dates_normalized(ndarray[int64_t] stamps, tz=None): # Some general helper functions #---------------------------------------------------------------------- -def isleapyear(int64_t year): - return is_leapyear(year) + +cpdef _isleapyear_arr(ndarray years): + cdef: + ndarray[int8_t] out + + # to make NaT result as False + out = np.zeros(len(years), dtype='int8') + out[np.logical_or(years % 400 == 0, + np.logical_and(years % 4 == 0, + years % 100 > 0))] = 1 + return out.view(bool) + def monthrange(int64_t year, int64_t month): cdef: @@ -4846,7 +5011,7 @@ def 
monthrange(int64_t year, int64_t month): if month < 1 or month > 12: raise ValueError("bad month number 0; must be 1-12") - days = days_per_month_table[is_leapyear(year)][month-1] + days = days_per_month_table[is_leapyear(year)][month -1] return (dayofweek(year, month, 1), days) @@ -4854,7 +5019,7 @@ cdef inline int64_t ts_dayofweek(_TSObject ts): return dayofweek(ts.dts.year, ts.dts.month, ts.dts.day) cdef inline int days_in_month(pandas_datetimestruct dts) nogil: - return days_per_month_table[is_leapyear(dts.year)][dts.month-1] + return days_per_month_table[is_leapyear(dts.year)][dts.month -1] cpdef normalize_date(object dt): """ @@ -4880,10 +5045,14 @@ cdef inline int _year_add_months(pandas_datetimestruct dts, cdef inline int _month_add_months(pandas_datetimestruct dts, int months) nogil: - """new month number after shifting pandas_datetimestruct number of months""" + """ + New month number after shifting pandas_datetimestruct + number of months. + """ cdef int new_month = (dts.month + months) % 12 return 12 if new_month == 0 else new_month + @cython.wraparound(False) @cython.boundscheck(False) def shift_months(int64_t[:] dtindex, int months, object day=None): @@ -4908,7 +5077,8 @@ def shift_months(int64_t[:] dtindex, int months, object day=None): with nogil: for i in range(count): if dtindex[i] == NPY_NAT: out[i] = NPY_NAT; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) dts.year = _year_add_months(dts, months) dts.month = _month_add_months(dts, months) @@ -4922,7 +5092,8 @@ def shift_months(int64_t[:] dtindex, int months, object day=None): with nogil: for i in range(count): if dtindex[i] == NPY_NAT: out[i] = NPY_NAT; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) months_to_roll = months # offset semantics - if on the anchor point and going backwards @@ -4943,7 +5114,8 @@ def shift_months(int64_t[:] dtindex, int months, object day=None): with nogil: for i in range(count): if dtindex[i] == NPY_NAT: out[i] = NPY_NAT; continue - pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + pandas_datetime_to_datetimestruct( + dtindex[i], PANDAS_FR_ns, &dts) months_to_roll = months # similar semantics - when adding shift forward by one @@ -4998,10 +5170,12 @@ except: __all__ = [] + def _getlang(): # Figure out what the current language is set to. return locale.getlocale(locale.LC_TIME) + class LocaleTime(object): """Stores and handles locale-specific information related to time. @@ -5081,8 +5255,9 @@ class LocaleTime(object): # magical; just happened to have used it everywhere else where a # static date was needed. am_pm = [] - for hour in (01,22): - time_tuple = time.struct_time((1999,3,17,hour,44,55,2,76,0)) + for hour in (01, 22): + time_tuple = time.struct_time( + (1999, 3, 17, hour, 44, 55, 2, 76, 0)) am_pm.append(time.strftime("%p", time_tuple).lower()) self.am_pm = am_pm @@ -5094,22 +5269,23 @@ class LocaleTime(object): # overloaded numbers is minimized. The order in which searches for # values within the format string is very important; it eliminates # possible ambiguity for what something represents. 
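The LocaleTime block around here recovers the locale's %c/%x/%X patterns by formatting a fixed, unambiguous timestamp and then substituting each recognisable value back with its strftime directive, which is why the ordering of the replacement pairs matters. A compact sketch of that trick:

```python
import time

# Format a fixed, known timestamp, then replace each recognisable value with
# its strftime directive; ordering matters so that e.g. '1999' is consumed
# before '99'.
time_tuple = time.struct_time((1999, 3, 17, 22, 44, 55, 2, 76, 0))
rendered = time.strftime("%x", time_tuple).lower()
for old, new in [('1999', '%Y'), ('99', '%y'), ('17', '%d'), ('03', '%m'),
                 ('3', '%m'), ('2', '%w'), ('22', '%H')]:
    rendered = rendered.replace(old, new)
print(rendered)    # e.g. '%m/%d/%y' on an en_US locale
```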
- time_tuple = time.struct_time((1999,3,17,22,44,55,2,76,0)) + time_tuple = time.struct_time((1999, 3, 17, 22, 44, 55, 2, 76, 0)) date_time = [None, None, None] date_time[0] = time.strftime("%c", time_tuple).lower() date_time[1] = time.strftime("%x", time_tuple).lower() date_time[2] = time.strftime("%X", time_tuple).lower() replacement_pairs = [('%', '%%'), (self.f_weekday[2], '%A'), - (self.f_month[3], '%B'), (self.a_weekday[2], '%a'), - (self.a_month[3], '%b'), (self.am_pm[1], '%p'), - ('1999', '%Y'), ('99', '%y'), ('22', '%H'), - ('44', '%M'), ('55', '%S'), ('76', '%j'), - ('17', '%d'), ('03', '%m'), ('3', '%m'), - # '3' needed for when no leading zero. - ('2', '%w'), ('10', '%I')] + (self.f_month[3], + '%B'), (self.a_weekday[2], '%a'), + (self.a_month[3], '%b'), (self.am_pm[1], '%p'), + ('1999', '%Y'), ('99', '%y'), ('22', '%H'), + ('44', '%M'), ('55', '%S'), ('76', '%j'), + ('17', '%d'), ('03', '%m'), ('3', '%m'), + # '3' needed for when no leading zero. + ('2', '%w'), ('10', '%I')] replacement_pairs.extend([(tz, "%Z") for tz_values in self.timezone for tz in tz_values]) - for offset,directive in ((0,'%c'), (1,'%x'), (2,'%X')): + for offset, directive in ((0, '%c'), (1, '%x'), (2, '%X')): current_format = date_time[offset] for old, new in replacement_pairs: # Must deal with possible lack of locale info @@ -5121,7 +5297,7 @@ class LocaleTime(object): # If %W is used, then Sunday, 2005-01-03 will fall on week 0 since # 2005-01-03 occurs before the first Monday of the year. Otherwise # %U is used. - time_tuple = time.struct_time((1999,1,3,1,1,1,6,3,0)) + time_tuple = time.struct_time((1999, 1, 3, 1, 1, 1, 6, 3, 0)) if '00' in time.strftime(directive, time_tuple): U_W = '%W' else: @@ -5167,7 +5343,8 @@ class TimeRE(dict): 'f': r"(?P[0-9]{1,9})", 'H': r"(?P2[0-3]|[0-1]\d|\d)", 'I': r"(?P1[0-2]|0[1-9]|[1-9])", - 'j': r"(?P36[0-6]|3[0-5]\d|[1-2]\d\d|0[1-9]\d|00[1-9]|[1-9]\d|0[1-9]|[1-9])", + 'j': (r"(?P36[0-6]|3[0-5]\d|[1-2]\d\d|0[1-9]\d|00[1-9]|" + r"[1-9]\d|0[1-9]|[1-9])"), 'm': r"(?P1[0-2]|0[1-9]|[1-9])", 'M': r"(?P[0-5]\d|\d)", 'S': r"(?P6[0-1]|[0-5]\d|\d)", @@ -5227,11 +5404,11 @@ class TimeRE(dict): whitespace_replacement = re_compile(r'\s+') format = whitespace_replacement.sub(r'\\s+', format) while '%' in format: - directive_index = format.index('%')+1 + directive_index = format.index('%') +1 processed_format = "%s%s%s" % (processed_format, - format[:directive_index-1], + format[:directive_index -1], self[format[directive_index]]) - format = format[directive_index+1:] + format = format[directive_index +1:] return "%s%s" % (processed_format, format) def compile(self, format): @@ -5245,7 +5422,8 @@ _TimeRE_cache = TimeRE() _CACHE_MAX_SIZE = 5 # Max number of regexes stored in _regex_cache _regex_cache = {} -cdef _calc_julian_from_U_or_W(int year, int week_of_year, int day_of_week, int week_starts_Mon): +cdef _calc_julian_from_U_or_W(int year, int week_of_year, + int day_of_week, int week_starts_Mon): """Calculate the Julian day based on the year, week of the year, and day of the week, with week_start_day representing whether the week of the year assumes the week starts on Sunday or Monday (6 or 0).""" diff --git a/pandas/types/api.py b/pandas/types/api.py index 721d8d29bba8b..096dc2f84aa67 100644 --- a/pandas/types/api.py +++ b/pandas/types/api.py @@ -1,75 +1,56 @@ # flake8: noqa import numpy as np -from pandas.compat import string_types -from .dtypes import (CategoricalDtype, CategoricalDtypeType, - DatetimeTZDtype, DatetimeTZDtypeType) -from .generic import (ABCIndex, ABCInt64Index, 
ABCRangeIndex, - ABCFloat64Index, ABCMultiIndex, - ABCDatetimeIndex, - ABCTimedeltaIndex, ABCPeriodIndex, - ABCCategoricalIndex, - ABCIndexClass, - ABCSeries, ABCDataFrame, ABCPanel, - ABCSparseSeries, ABCSparseArray, - ABCCategorical, ABCPeriod, - ABCGeneric) - -def pandas_dtype(dtype): - """ - Converts input into a pandas only dtype object or a numpy dtype object. - - Parameters - ---------- - dtype : object to be converted - - Returns - ------- - np.dtype or a pandas dtype - """ - if isinstance(dtype, DatetimeTZDtype): - return dtype - elif isinstance(dtype, CategoricalDtype): - return dtype - elif isinstance(dtype, string_types): - try: - return DatetimeTZDtype.construct_from_string(dtype) - except TypeError: - pass - - try: - return CategoricalDtype.construct_from_string(dtype) - except TypeError: - pass - - return np.dtype(dtype) - -def na_value_for_dtype(dtype): - """ - Return a dtype compat na value - - Parameters - ---------- - dtype : string / dtype - - Returns - ------- - dtype compat na value - """ - - from pandas.core import common as com - from pandas import NaT - dtype = pandas_dtype(dtype) - - if (com.is_datetime64_dtype(dtype) or - com.is_datetime64tz_dtype(dtype) or - com.is_timedelta64_dtype(dtype)): - return NaT - elif com.is_float_dtype(dtype): - return np.nan - elif com.is_integer_dtype(dtype): - return 0 - elif com.is_bool_dtype(dtype): - return False - return np.nan +from .common import (pandas_dtype, + is_dtype_equal, + is_extension_type, + + # categorical + is_categorical, + is_categorical_dtype, + + # datetimelike + is_datetimetz, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_datetime64_any_dtype, + is_datetime64_ns_dtype, + is_timedelta64_dtype, + is_timedelta64_ns_dtype, + is_period, + is_period_dtype, + + # string-like + is_string_dtype, + is_object_dtype, + + # sparse + is_sparse, + + # numeric types + is_scalar, + is_sparse, + is_bool, + is_integer, + is_float, + is_complex, + is_number, + is_any_int_dtype, + is_integer_dtype, + is_int64_dtype, + is_numeric_dtype, + is_float_dtype, + is_floating_dtype, + is_bool_dtype, + is_complex_dtype, + + # like + is_re, + is_re_compilable, + is_dict_like, + is_iterator, + is_list_like, + is_hashable, + is_named_tuple, + is_sequence) diff --git a/pandas/types/cast.py b/pandas/types/cast.py new file mode 100644 index 0000000000000..a79862eb195b6 --- /dev/null +++ b/pandas/types/cast.py @@ -0,0 +1,888 @@ +""" routings for casting """ + +from datetime import datetime, timedelta +import numpy as np +from pandas import lib, tslib +from pandas.tslib import iNaT +from pandas.compat import string_types, text_type, PY3 +from .common import (_ensure_object, is_bool, is_integer, is_float, + is_complex, is_datetimetz, is_categorical_dtype, + is_extension_type, is_object_dtype, + is_datetime64tz_dtype, is_datetime64_dtype, + is_timedelta64_dtype, is_dtype_equal, + is_float_dtype, is_complex_dtype, + is_integer_dtype, is_datetime_or_timedelta_dtype, + is_bool_dtype, is_scalar, + _string_dtypes, + _coerce_to_dtype, + _ensure_int8, _ensure_int16, + _ensure_int32, _ensure_int64, + _NS_DTYPE, _TD_DTYPE, _INT64_DTYPE, + _DATELIKE_DTYPES, _POSSIBLY_CAST_DTYPES) +from .dtypes import ExtensionDtype +from .generic import ABCDatetimeIndex, ABCPeriodIndex, ABCSeries +from .missing import isnull, notnull +from .inference import is_list_like + +_int8_max = np.iinfo(np.int8).max +_int16_max = np.iinfo(np.int16).max +_int32_max = np.iinfo(np.int32).max +_int64_max = np.iinfo(np.int64).max + + +def _possibly_convert_platform(values): + """ try 
to do platform conversion, allow ndarray or list here """ + + if isinstance(values, (list, tuple)): + values = lib.list_to_object_array(list(values)) + if getattr(values, 'dtype', None) == np.object_: + if hasattr(values, '_values'): + values = values._values + values = lib.maybe_convert_objects(values) + + return values + + +def _possibly_downcast_to_dtype(result, dtype): + """ try to cast to the specified dtype (e.g. convert back to bool/int + or could be an astype of float64->float32 + """ + + if is_scalar(result): + return result + + def trans(x): + return x + + if isinstance(dtype, string_types): + if dtype == 'infer': + inferred_type = lib.infer_dtype(_ensure_object(result.ravel())) + if inferred_type == 'boolean': + dtype = 'bool' + elif inferred_type == 'integer': + dtype = 'int64' + elif inferred_type == 'datetime64': + dtype = 'datetime64[ns]' + elif inferred_type == 'timedelta64': + dtype = 'timedelta64[ns]' + + # try to upcast here + elif inferred_type == 'floating': + dtype = 'int64' + if issubclass(result.dtype.type, np.number): + + def trans(x): # noqa + return x.round() + else: + dtype = 'object' + + if isinstance(dtype, string_types): + dtype = np.dtype(dtype) + + try: + + # don't allow upcasts here (except if empty) + if dtype.kind == result.dtype.kind: + if (result.dtype.itemsize <= dtype.itemsize and + np.prod(result.shape)): + return result + + if issubclass(dtype.type, np.floating): + return result.astype(dtype) + elif is_bool_dtype(dtype) or is_integer_dtype(dtype): + + # if we don't have any elements, just astype it + if not np.prod(result.shape): + return trans(result).astype(dtype) + + # do a test on the first element, if it fails then we are done + r = result.ravel() + arr = np.array([r[0]]) + + # if we have any nulls, then we are done + if isnull(arr).any() or not np.allclose(arr, + trans(arr).astype(dtype)): + return result + + # a comparable, e.g. a Decimal may slip in here + elif not isinstance(r[0], (np.integer, np.floating, np.bool, int, + float, bool)): + return result + + if (issubclass(result.dtype.type, (np.object_, np.number)) and + notnull(result).all()): + new_result = trans(result).astype(dtype) + try: + if np.allclose(new_result, result): + return new_result + except: + + # comparison of an object dtype with a number type could + # hit here + if (new_result == result).all(): + return new_result + + # a datetimelike + # GH12821, iNaT is casted to float + elif dtype.kind in ['M', 'm'] and result.dtype.kind in ['i', 'f']: + try: + result = result.astype(dtype) + except: + if dtype.tz: + # convert to datetime and change timezone + from pandas import to_datetime + result = to_datetime(result).tz_localize(dtype.tz) + + except: + pass + + return result + + +def _maybe_upcast_putmask(result, mask, other): + """ + A safe version of putmask that potentially upcasts the result + + Parameters + ---------- + result : ndarray + The destination array. This will be mutated in-place if no upcasting is + necessary. 
+ mask : boolean ndarray + other : ndarray or scalar + The source array or value + + Returns + ------- + result : ndarray + changed : boolean + Set to true if the result array was upcasted + """ + + if mask.any(): + # Two conversions for date-like dtypes that can't be done automatically + # in np.place: + # NaN -> NaT + # integer or integer array -> date-like array + if result.dtype in _DATELIKE_DTYPES: + if is_scalar(other): + if isnull(other): + other = result.dtype.type('nat') + elif is_integer(other): + other = np.array(other, dtype=result.dtype) + elif is_integer_dtype(other): + other = np.array(other, dtype=result.dtype) + + def changeit(): + + # try to directly set by expanding our array to full + # length of the boolean + try: + om = other[mask] + om_at = om.astype(result.dtype) + if (om == om_at).all(): + new_result = result.values.copy() + new_result[mask] = om_at + result[:] = new_result + return result, False + except: + pass + + # we are forced to change the dtype of the result as the input + # isn't compatible + r, _ = _maybe_upcast(result, fill_value=other, copy=True) + np.place(r, mask, other) + + return r, True + + # we want to decide whether place will work + # if we have nans in the False portion of our mask then we need to + # upcast (possibly), otherwise we DON't want to upcast (e.g. if we + # have values, say integers, in the success portion then it's ok to not + # upcast) + new_dtype, _ = _maybe_promote(result.dtype, other) + if new_dtype != result.dtype: + + # we have a scalar or len 0 ndarray + # and its nan and we are changing some values + if (is_scalar(other) or + (isinstance(other, np.ndarray) and other.ndim < 1)): + if isnull(other): + return changeit() + + # we have an ndarray and the masking has nans in it + else: + + if isnull(other[mask]).any(): + return changeit() + + try: + np.place(result, mask, other) + except: + return changeit() + + return result, False + + +def _maybe_promote(dtype, fill_value=np.nan): + + # if we passed an array here, determine the fill value by dtype + if isinstance(fill_value, np.ndarray): + if issubclass(fill_value.dtype.type, (np.datetime64, np.timedelta64)): + fill_value = iNaT + else: + + # we need to change to object type as our + # fill_value is of object type + if fill_value.dtype == np.object_: + dtype = np.dtype(np.object_) + fill_value = np.nan + + # returns tuple of (dtype, fill_value) + if issubclass(dtype.type, (np.datetime64, np.timedelta64)): + # for now: refuse to upcast datetime64 + # (this is because datetime64 will not implicitly upconvert + # to object correctly as of numpy 1.6.1) + if isnull(fill_value): + fill_value = iNaT + else: + if issubclass(dtype.type, np.datetime64): + try: + fill_value = lib.Timestamp(fill_value).value + except: + # the proper thing to do here would probably be to upcast + # to object (but numpy 1.6.1 doesn't do this properly) + fill_value = iNaT + elif issubclass(dtype.type, np.timedelta64): + try: + fill_value = lib.Timedelta(fill_value).value + except: + # as for datetimes, cannot upcast to object + fill_value = iNaT + else: + fill_value = iNaT + elif is_datetimetz(dtype): + if isnull(fill_value): + fill_value = iNaT + elif is_float(fill_value): + if issubclass(dtype.type, np.bool_): + dtype = np.object_ + elif issubclass(dtype.type, np.integer): + dtype = np.float64 + elif is_bool(fill_value): + if not issubclass(dtype.type, np.bool_): + dtype = np.object_ + elif is_integer(fill_value): + if issubclass(dtype.type, np.bool_): + dtype = np.object_ + elif issubclass(dtype.type, 
np.integer): + # upcast to prevent overflow + arr = np.asarray(fill_value) + if arr != arr.astype(dtype): + dtype = arr.dtype + elif is_complex(fill_value): + if issubclass(dtype.type, np.bool_): + dtype = np.object_ + elif issubclass(dtype.type, (np.integer, np.floating)): + dtype = np.complex128 + elif fill_value is None: + if is_float_dtype(dtype) or is_complex_dtype(dtype): + fill_value = np.nan + elif is_integer_dtype(dtype): + dtype = np.float64 + fill_value = np.nan + elif is_datetime_or_timedelta_dtype(dtype): + fill_value = iNaT + else: + dtype = np.object_ + else: + dtype = np.object_ + + # in case we have a string that looked like a number + if is_categorical_dtype(dtype): + pass + elif is_datetimetz(dtype): + pass + elif issubclass(np.dtype(dtype).type, string_types): + dtype = np.object_ + + return dtype, fill_value + + +def _infer_dtype_from_scalar(val): + """ interpret the dtype from a scalar """ + + dtype = np.object_ + + # a 1-element ndarray + if isinstance(val, np.ndarray): + if val.ndim != 0: + raise ValueError( + "invalid ndarray passed to _infer_dtype_from_scalar") + + dtype = val.dtype + val = val.item() + + elif isinstance(val, string_types): + + # If we create an empty array using a string to infer + # the dtype, NumPy will only allocate one character per entry + # so this is kind of bad. Alternately we could use np.repeat + # instead of np.empty (but then you still don't want things + # coming out as np.str_! + + dtype = np.object_ + + elif isinstance(val, (np.datetime64, + datetime)) and getattr(val, 'tzinfo', None) is None: + val = lib.Timestamp(val).value + dtype = np.dtype('M8[ns]') + + elif isinstance(val, (np.timedelta64, timedelta)): + val = lib.Timedelta(val).value + dtype = np.dtype('m8[ns]') + + elif is_bool(val): + dtype = np.bool_ + + elif is_integer(val): + if isinstance(val, np.integer): + dtype = type(val) + else: + dtype = np.int64 + + elif is_float(val): + if isinstance(val, np.floating): + dtype = type(val) + else: + dtype = np.float64 + + elif is_complex(val): + dtype = np.complex_ + + return dtype, val + + +def _maybe_upcast(values, fill_value=np.nan, dtype=None, copy=False): + """ provide explict type promotion and coercion + + Parameters + ---------- + values : the ndarray that we want to maybe upcast + fill_value : what we want to fill with + dtype : if None, then use the dtype of the values, else coerce to this type + copy : if True always make a copy even if no upcast is required + """ + + if is_extension_type(values): + if copy: + values = values.copy() + else: + if dtype is None: + dtype = values.dtype + new_dtype, fill_value = _maybe_promote(dtype, fill_value) + if new_dtype != values.dtype: + values = values.astype(new_dtype) + elif copy: + values = values.copy() + + return values, fill_value + + +def _possibly_cast_item(obj, item, dtype): + chunk = obj[item] + + if chunk.values.dtype != dtype: + if dtype in (np.object_, np.bool_): + obj[item] = chunk.astype(np.object_) + elif not issubclass(dtype, (np.integer, np.bool_)): # pragma: no cover + raise ValueError("Unexpected dtype encountered: %s" % dtype) + + +def _invalidate_string_dtypes(dtype_set): + """Change string like dtypes to object for + ``DataFrame.select_dtypes()``. + """ + non_string_dtypes = dtype_set - _string_dtypes + if non_string_dtypes != dtype_set: + raise TypeError("string dtypes are not allowed, use 'object' instead") + + +def _maybe_convert_string_to_object(values): + """ + + Convert string-like and string-like array to convert object dtype. 
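The promotion rules encoded in _maybe_promote above have a familiar user-visible consequence, sketched here for orientation (this snippet is not part of the diff): an integer column cannot store a missing value, so the presence of one promotes the dtype to float64.

import pandas as pd

# Promotion in action: NaN cannot live in an int64 block, so the dtype is
# upcast to float64 as soon as a missing value must be held.
print(pd.Series([1, 2, 3]).dtype)      # int64
print(pd.Series([1, 2, None]).dtype)   # float64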
+ This is to avoid numpy to handle the array as str dtype. + """ + if isinstance(values, string_types): + values = np.array([values], dtype=object) + elif (isinstance(values, np.ndarray) and + issubclass(values.dtype.type, (np.string_, np.unicode_))): + values = values.astype(object) + return values + + +def _maybe_convert_scalar(values): + """ + Convert a python scalar to the appropriate numpy dtype if possible + This avoids numpy directly converting according to platform preferences + """ + if is_scalar(values): + dtype, values = _infer_dtype_from_scalar(values) + try: + values = dtype(values) + except TypeError: + pass + return values + + +def _coerce_indexer_dtype(indexer, categories): + """ coerce the indexer input array to the smallest dtype possible """ + l = len(categories) + if l < _int8_max: + return _ensure_int8(indexer) + elif l < _int16_max: + return _ensure_int16(indexer) + elif l < _int32_max: + return _ensure_int32(indexer) + return _ensure_int64(indexer) + + +def _coerce_to_dtypes(result, dtypes): + """ + given a dtypes and a result set, coerce the result elements to the + dtypes + """ + if len(result) != len(dtypes): + raise AssertionError("_coerce_to_dtypes requires equal len arrays") + + from pandas.tseries.timedeltas import _coerce_scalar_to_timedelta_type + + def conv(r, dtype): + try: + if isnull(r): + pass + elif dtype == _NS_DTYPE: + r = lib.Timestamp(r) + elif dtype == _TD_DTYPE: + r = _coerce_scalar_to_timedelta_type(r) + elif dtype == np.bool_: + # messy. non 0/1 integers do not get converted. + if is_integer(r) and r not in [0, 1]: + return int(r) + r = bool(r) + elif dtype.kind == 'f': + r = float(r) + elif dtype.kind == 'i': + r = int(r) + except: + pass + + return r + + return [conv(r, dtype) for r, dtype in zip(result, dtypes)] + + +def _astype_nansafe(arr, dtype, copy=True): + """ return a view if copy is False, but + need to be very careful as the result shape could change! 
""" + if not isinstance(dtype, np.dtype): + dtype = _coerce_to_dtype(dtype) + + if issubclass(dtype.type, text_type): + # in Py3 that's str, in Py2 that's unicode + return lib.astype_unicode(arr.ravel()).reshape(arr.shape) + elif issubclass(dtype.type, string_types): + return lib.astype_str(arr.ravel()).reshape(arr.shape) + elif is_datetime64_dtype(arr): + if dtype == object: + return tslib.ints_to_pydatetime(arr.view(np.int64)) + elif dtype == np.int64: + return arr.view(dtype) + elif dtype != _NS_DTYPE: + raise TypeError("cannot astype a datetimelike from [%s] to [%s]" % + (arr.dtype, dtype)) + return arr.astype(_NS_DTYPE) + elif is_timedelta64_dtype(arr): + if dtype == np.int64: + return arr.view(dtype) + elif dtype == object: + return tslib.ints_to_pytimedelta(arr.view(np.int64)) + + # in py3, timedelta64[ns] are int64 + elif ((PY3 and dtype not in [_INT64_DTYPE, _TD_DTYPE]) or + (not PY3 and dtype != _TD_DTYPE)): + + # allow frequency conversions + if dtype.kind == 'm': + mask = isnull(arr) + result = arr.astype(dtype).astype(np.float64) + result[mask] = np.nan + return result + + raise TypeError("cannot astype a timedelta from [%s] to [%s]" % + (arr.dtype, dtype)) + + return arr.astype(_TD_DTYPE) + elif (np.issubdtype(arr.dtype, np.floating) and + np.issubdtype(dtype, np.integer)): + + if np.isnan(arr).any(): + raise ValueError('Cannot convert NA to integer') + elif arr.dtype == np.object_ and np.issubdtype(dtype.type, np.integer): + # work around NumPy brokenness, #1987 + return lib.astype_intsafe(arr.ravel(), dtype).reshape(arr.shape) + + if copy: + return arr.astype(dtype) + return arr.view(dtype) + + +def _possibly_convert_objects(values, convert_dates=True, convert_numeric=True, + convert_timedeltas=True, copy=True): + """ if we have an object dtype, try to coerce dates and/or numbers """ + + # if we have passed in a list or scalar + if isinstance(values, (list, tuple)): + values = np.array(values, dtype=np.object_) + if not hasattr(values, 'dtype'): + values = np.array([values], dtype=np.object_) + + # convert dates + if convert_dates and values.dtype == np.object_: + + # we take an aggressive stance and convert to datetime64[ns] + if convert_dates == 'coerce': + new_values = _possibly_cast_to_datetime(values, 'M8[ns]', + errors='coerce') + + # if we are all nans then leave me alone + if not isnull(new_values).all(): + values = new_values + + else: + values = lib.maybe_convert_objects(values, + convert_datetime=convert_dates) + + # convert timedeltas + if convert_timedeltas and values.dtype == np.object_: + + if convert_timedeltas == 'coerce': + from pandas.tseries.timedeltas import to_timedelta + new_values = to_timedelta(values, coerce=True) + + # if we are all nans then leave me alone + if not isnull(new_values).all(): + values = new_values + + else: + values = lib.maybe_convert_objects( + values, convert_timedelta=convert_timedeltas) + + # convert to numeric + if values.dtype == np.object_: + if convert_numeric: + try: + new_values = lib.maybe_convert_numeric(values, set(), + coerce_numeric=True) + + # if we are all nans then leave me alone + if not isnull(new_values).all(): + values = new_values + + except: + pass + else: + # soft-conversion + values = lib.maybe_convert_objects(values) + + values = values.copy() if copy else values + + return values + + +def _soft_convert_objects(values, datetime=True, numeric=True, timedelta=True, + coerce=False, copy=True): + """ if we have an object dtype, try to coerce dates and/or numbers """ + + conversion_count = sum((datetime, 
numeric, timedelta)) + if conversion_count == 0: + raise ValueError('At least one of datetime, numeric or timedelta must ' + 'be True.') + elif conversion_count > 1 and coerce: + raise ValueError("Only one of 'datetime', 'numeric' or " + "'timedelta' can be True when when coerce=True.") + + if isinstance(values, (list, tuple)): + # List or scalar + values = np.array(values, dtype=np.object_) + elif not hasattr(values, 'dtype'): + values = np.array([values], dtype=np.object_) + elif not is_object_dtype(values.dtype): + # If not object, do not attempt conversion + values = values.copy() if copy else values + return values + + # If 1 flag is coerce, ensure 2 others are False + if coerce: + # Immediate return if coerce + if datetime: + from pandas import to_datetime + return to_datetime(values, errors='coerce', box=False) + elif timedelta: + from pandas import to_timedelta + return to_timedelta(values, errors='coerce', box=False) + elif numeric: + from pandas import to_numeric + return to_numeric(values, errors='coerce') + + # Soft conversions + if datetime: + values = lib.maybe_convert_objects(values, convert_datetime=datetime) + + if timedelta and is_object_dtype(values.dtype): + # Object check to ensure only run if previous did not convert + values = lib.maybe_convert_objects(values, convert_timedelta=timedelta) + + if numeric and is_object_dtype(values.dtype): + try: + converted = lib.maybe_convert_numeric(values, set(), + coerce_numeric=True) + # If all NaNs, then do not-alter + values = converted if not isnull(converted).all() else values + values = values.copy() if copy else values + except: + pass + + return values + + +def _possibly_castable(arr): + # return False to force a non-fastpath + + # check datetime64[ns]/timedelta64[ns] are valid + # otherwise try to coerce + kind = arr.dtype.kind + if kind == 'M' or kind == 'm': + return arr.dtype in _DATELIKE_DTYPES + + return arr.dtype.name not in _POSSIBLY_CAST_DTYPES + + +def _possibly_infer_to_datetimelike(value, convert_dates=False): + """ + we might have a array (or single object) that is datetime like, + and no dtype is passed don't change the value unless we find a + datetime/timedelta set + + this is pretty strict in that a datetime/timedelta is REQUIRED + in addition to possible nulls/string likes + + ONLY strings are NOT datetimelike + + Parameters + ---------- + value : np.array / Series / Index / list-like + convert_dates : boolean, default False + if True try really hard to convert dates (such as datetime.date), other + leave inferred dtype 'date' alone + + """ + + if isinstance(value, (ABCDatetimeIndex, ABCPeriodIndex)): + return value + elif isinstance(value, ABCSeries): + if isinstance(value._values, ABCDatetimeIndex): + return value._values + + v = value + + if not is_list_like(v): + v = [v] + v = np.array(v, copy=False) + shape = v.shape + if not v.ndim == 1: + v = v.ravel() + + if len(v): + + def _try_datetime(v): + # safe coerce to datetime64 + try: + v = tslib.array_to_datetime(v, errors='raise') + except ValueError: + + # we might have a sequence of the same-datetimes with tz's + # if so coerce to a DatetimeIndex; if they are not the same, + # then these stay as object dtype + try: + from pandas import to_datetime + return to_datetime(v) + except: + pass + + except: + pass + + return v.reshape(shape) + + def _try_timedelta(v): + # safe coerce to timedelta64 + + # will try first with a string & object conversion + from pandas import to_timedelta + try: + return to_timedelta(v)._values.reshape(shape) + except: + return 
v + + # do a quick inference for perf + sample = v[:min(3, len(v))] + inferred_type = lib.infer_dtype(sample) + + if (inferred_type in ['datetime', 'datetime64'] or + (convert_dates and inferred_type in ['date'])): + value = _try_datetime(v) + elif inferred_type in ['timedelta', 'timedelta64']: + value = _try_timedelta(v) + + # It's possible to have nulls intermixed within the datetime or + # timedelta. These will in general have an inferred_type of 'mixed', + # so have to try both datetime and timedelta. + + # try timedelta first to avoid spurious datetime conversions + # e.g. '00:00:01' is a timedelta but technically is also a datetime + elif inferred_type in ['mixed']: + + if lib.is_possible_datetimelike_array(_ensure_object(v)): + value = _try_timedelta(v) + if lib.infer_dtype(value) in ['mixed']: + value = _try_datetime(v) + + return value + + +def _possibly_cast_to_datetime(value, dtype, errors='raise'): + """ try to cast the array/value to a datetimelike dtype, converting float + nan to iNaT + """ + from pandas.tseries.timedeltas import to_timedelta + from pandas.tseries.tools import to_datetime + + if dtype is not None: + if isinstance(dtype, string_types): + dtype = np.dtype(dtype) + + is_datetime64 = is_datetime64_dtype(dtype) + is_datetime64tz = is_datetime64tz_dtype(dtype) + is_timedelta64 = is_timedelta64_dtype(dtype) + + if is_datetime64 or is_datetime64tz or is_timedelta64: + + # force the dtype if needed + if is_datetime64 and not is_dtype_equal(dtype, _NS_DTYPE): + if dtype.name == 'datetime64[ns]': + dtype = _NS_DTYPE + else: + raise TypeError("cannot convert datetimelike to " + "dtype [%s]" % dtype) + elif is_datetime64tz: + + # our NaT doesn't support tz's + # this will coerce to DatetimeIndex with + # a matching dtype below + if is_scalar(value) and isnull(value): + value = [value] + + elif is_timedelta64 and not is_dtype_equal(dtype, _TD_DTYPE): + if dtype.name == 'timedelta64[ns]': + dtype = _TD_DTYPE + else: + raise TypeError("cannot convert timedeltalike to " + "dtype [%s]" % dtype) + + if is_scalar(value): + if value == tslib.iNaT or isnull(value): + value = tslib.iNaT + else: + value = np.array(value, copy=False) + + # have a scalar array-like (e.g. 
NaT) + if value.ndim == 0: + value = tslib.iNaT + + # we have an array of datetime or timedeltas & nulls + elif np.prod(value.shape) or not is_dtype_equal(value.dtype, + dtype): + try: + if is_datetime64: + value = to_datetime(value, errors=errors)._values + elif is_datetime64tz: + # input has to be UTC at this point, so just + # localize + value = to_datetime( + value, + errors=errors).tz_localize(dtype.tz) + elif is_timedelta64: + value = to_timedelta(value, errors=errors)._values + except (AttributeError, ValueError, TypeError): + pass + + # coerce datetimelike to object + elif is_datetime64_dtype(value) and not is_datetime64_dtype(dtype): + if is_object_dtype(dtype): + if value.dtype != _NS_DTYPE: + value = value.astype(_NS_DTYPE) + ints = np.asarray(value).view('i8') + return tslib.ints_to_pydatetime(ints) + + # we have a non-castable dtype that was passed + raise TypeError('Cannot cast datetime64 to %s' % dtype) + + else: + + is_array = isinstance(value, np.ndarray) + + # catch a datetime/timedelta that is not of ns variety + # and no coercion specified + if is_array and value.dtype.kind in ['M', 'm']: + dtype = value.dtype + + if dtype.kind == 'M' and dtype != _NS_DTYPE: + value = value.astype(_NS_DTYPE) + + elif dtype.kind == 'm' and dtype != _TD_DTYPE: + value = to_timedelta(value) + + # only do this if we have an array and the dtype of the array is not + # setup already we are not an integer/object, so don't bother with this + # conversion + elif not (is_array and not (issubclass(value.dtype.type, np.integer) or + value.dtype == np.object_)): + value = _possibly_infer_to_datetimelike(value) + + return value + + +def _find_common_type(types): + """Find a common data type among the given dtypes.""" + + if len(types) == 0: + raise ValueError('no types given') + + first = types[0] + # workaround for find_common_type([np.dtype('datetime64[ns]')] * 2) + # => object + if all(is_dtype_equal(first, t) for t in types[1:]): + return first + + if any(isinstance(t, ExtensionDtype) for t in types): + return np.object + + # take lowest unit + if all(is_datetime64_dtype(t) for t in types): + return np.dtype('datetime64[ns]') + if all(is_timedelta64_dtype(t) for t in types): + return np.dtype('timedelta64[ns]') + + return np.find_common_type(types, []) diff --git a/pandas/types/common.py b/pandas/types/common.py new file mode 100644 index 0000000000000..2e7a67112e6db --- /dev/null +++ b/pandas/types/common.py @@ -0,0 +1,451 @@ +""" common type operations """ + +import numpy as np +from pandas.compat import string_types, text_type, binary_type +from pandas import lib, algos +from .dtypes import (CategoricalDtype, CategoricalDtypeType, + DatetimeTZDtype, DatetimeTZDtypeType, + PeriodDtype, PeriodDtypeType, + ExtensionDtype) +from .generic import (ABCCategorical, ABCPeriodIndex, + ABCDatetimeIndex, ABCSeries, + ABCSparseArray, ABCSparseSeries) +from .inference import is_string_like +from .inference import * # noqa + + +_POSSIBLY_CAST_DTYPES = set([np.dtype(t).name + for t in ['O', 'int8', 'uint8', 'int16', 'uint16', + 'int32', 'uint32', 'int64', 'uint64']]) + +_NS_DTYPE = np.dtype('M8[ns]') +_TD_DTYPE = np.dtype('m8[ns]') +_INT64_DTYPE = np.dtype(np.int64) +_DATELIKE_DTYPES = set([np.dtype(t) + for t in ['M8[ns]', 'M8[ns]', + 'm8[ns]', 'm8[ns]']]) + +_ensure_float64 = algos.ensure_float64 +_ensure_float32 = algos.ensure_float32 + + +def _ensure_float(arr): + if issubclass(arr.dtype.type, (np.integer, np.bool_)): + arr = arr.astype(float) + return arr + +_ensure_int64 = algos.ensure_int64 
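_find_common_type, added just above in cast.py, short-circuits when every dtype is identical, maps any pandas extension dtype to object, special-cases uniform datetime64/timedelta64 input, and otherwise defers to numpy's promotion rules (via np.find_common_type). A small illustration of that numpy fallback for plain numeric dtypes, not part of the diff; np.result_type is used here only because it gives the same answers for these cases:

import numpy as np

# The fallback path _find_common_type takes for ordinary numeric dtypes is
# just numpy promotion.
print(np.result_type(np.dtype('int64'), np.dtype('float32')))   # float64
print(np.result_type(np.dtype('int8'), np.dtype('uint8')))      # int16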
+_ensure_int32 = algos.ensure_int32 +_ensure_int16 = algos.ensure_int16 +_ensure_int8 = algos.ensure_int8 +_ensure_platform_int = algos.ensure_platform_int +_ensure_object = algos.ensure_object + + +def is_object_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return issubclass(tipo, np.object_) + + +def is_sparse(array): + """ return if we are a sparse array """ + return isinstance(array, (ABCSparseArray, ABCSparseSeries)) + + +def is_categorical(array): + """ return if we are a categorical possibility """ + return isinstance(array, ABCCategorical) or is_categorical_dtype(array) + + +def is_datetimetz(array): + """ return if we are a datetime with tz array """ + return ((isinstance(array, ABCDatetimeIndex) and + getattr(array, 'tz', None) is not None) or + is_datetime64tz_dtype(array)) + + +def is_period(array): + """ return if we are a period array """ + return isinstance(array, ABCPeriodIndex) or is_period_arraylike(array) + + +def is_datetime64_dtype(arr_or_dtype): + try: + tipo = _get_dtype_type(arr_or_dtype) + except TypeError: + return False + return issubclass(tipo, np.datetime64) + + +def is_datetime64tz_dtype(arr_or_dtype): + return DatetimeTZDtype.is_dtype(arr_or_dtype) + + +def is_timedelta64_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return issubclass(tipo, np.timedelta64) + + +def is_period_dtype(arr_or_dtype): + return PeriodDtype.is_dtype(arr_or_dtype) + + +def is_categorical_dtype(arr_or_dtype): + return CategoricalDtype.is_dtype(arr_or_dtype) + + +def is_string_dtype(arr_or_dtype): + dtype = _get_dtype(arr_or_dtype) + return dtype.kind in ('O', 'S', 'U') and not is_period_dtype(dtype) + + +def is_period_arraylike(arr): + """ return if we are period arraylike / PeriodIndex """ + if isinstance(arr, ABCPeriodIndex): + return True + elif isinstance(arr, (np.ndarray, ABCSeries)): + return arr.dtype == object and lib.infer_dtype(arr) == 'period' + return getattr(arr, 'inferred_type', None) == 'period' + + +def is_datetime_arraylike(arr): + """ return if we are datetime arraylike / DatetimeIndex """ + if isinstance(arr, ABCDatetimeIndex): + return True + elif isinstance(arr, (np.ndarray, ABCSeries)): + return arr.dtype == object and lib.infer_dtype(arr) == 'datetime' + return getattr(arr, 'inferred_type', None) == 'datetime' + + +def is_datetimelike(arr): + return (arr.dtype in _DATELIKE_DTYPES or + isinstance(arr, ABCPeriodIndex) or + is_datetimetz(arr)) + + +def is_dtype_equal(source, target): + """ return a boolean if the dtypes are equal """ + try: + source = _get_dtype(source) + target = _get_dtype(target) + return source == target + except (TypeError, AttributeError): + + # invalid comparison + # object == category will hit this + return False + + +def is_any_int_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return issubclass(tipo, np.integer) + + +def is_integer_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return (issubclass(tipo, np.integer) and + not issubclass(tipo, (np.datetime64, np.timedelta64))) + + +def is_int64_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return issubclass(tipo, np.int64) + + +def is_int_or_datetime_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return (issubclass(tipo, np.integer) or + issubclass(tipo, (np.datetime64, np.timedelta64))) + + +def is_datetime64_any_dtype(arr_or_dtype): + return (is_datetime64_dtype(arr_or_dtype) or + is_datetime64tz_dtype(arr_or_dtype)) + + +def is_datetime64_ns_dtype(arr_or_dtype): + try: + tipo = 
_get_dtype(arr_or_dtype) + except TypeError: + return False + return tipo == _NS_DTYPE + + +def is_timedelta64_ns_dtype(arr_or_dtype): + tipo = _get_dtype(arr_or_dtype) + return tipo == _TD_DTYPE + + +def is_datetime_or_timedelta_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return issubclass(tipo, (np.datetime64, np.timedelta64)) + + +def is_numeric_v_string_like(a, b): + """ + numpy doesn't like to compare numeric arrays vs scalar string-likes + + return a boolean result if this is the case for a,b or b,a + + """ + is_a_array = isinstance(a, np.ndarray) + is_b_array = isinstance(b, np.ndarray) + + is_a_numeric_array = is_a_array and is_numeric_dtype(a) + is_b_numeric_array = is_b_array and is_numeric_dtype(b) + is_a_string_array = is_a_array and is_string_like_dtype(a) + is_b_string_array = is_b_array and is_string_like_dtype(b) + + is_a_scalar_string_like = not is_a_array and is_string_like(a) + is_b_scalar_string_like = not is_b_array and is_string_like(b) + + return ((is_a_numeric_array and is_b_scalar_string_like) or + (is_b_numeric_array and is_a_scalar_string_like) or + (is_a_numeric_array and is_b_string_array) or + (is_b_numeric_array and is_a_string_array)) + + +def is_datetimelike_v_numeric(a, b): + # return if we have an i8 convertible and numeric comparison + if not hasattr(a, 'dtype'): + a = np.asarray(a) + if not hasattr(b, 'dtype'): + b = np.asarray(b) + + def is_numeric(x): + return is_integer_dtype(x) or is_float_dtype(x) + + is_datetimelike = needs_i8_conversion + return ((is_datetimelike(a) and is_numeric(b)) or + (is_datetimelike(b) and is_numeric(a))) + + +def is_datetimelike_v_object(a, b): + # return if we have an i8 convertible and object comparsion + if not hasattr(a, 'dtype'): + a = np.asarray(a) + if not hasattr(b, 'dtype'): + b = np.asarray(b) + + def f(x): + return is_object_dtype(x) + + def is_object(x): + return is_integer_dtype(x) or is_float_dtype(x) + + is_datetimelike = needs_i8_conversion + return ((is_datetimelike(a) and is_object(b)) or + (is_datetimelike(b) and is_object(a))) + + +def needs_i8_conversion(arr_or_dtype): + return (is_datetime_or_timedelta_dtype(arr_or_dtype) or + is_datetime64tz_dtype(arr_or_dtype) or + is_period_dtype(arr_or_dtype)) + + +def is_numeric_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return (issubclass(tipo, (np.number, np.bool_)) and + not issubclass(tipo, (np.datetime64, np.timedelta64))) + + +def is_string_like_dtype(arr_or_dtype): + # exclude object as its a mixed dtype + dtype = _get_dtype(arr_or_dtype) + return dtype.kind in ('S', 'U') + + +def is_float_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return issubclass(tipo, np.floating) + + +def is_floating_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return isinstance(tipo, np.floating) + + +def is_bool_dtype(arr_or_dtype): + try: + tipo = _get_dtype_type(arr_or_dtype) + except ValueError: + # this isn't even a dtype + return False + return issubclass(tipo, np.bool_) + + +def is_extension_type(value): + """ + if we are a klass that is preserved by the internals + these are internal klasses that we represent (and don't use a np.array) + """ + if is_categorical(value): + return True + elif is_sparse(value): + return True + elif is_datetimetz(value): + return True + return False + + +def is_complex_dtype(arr_or_dtype): + tipo = _get_dtype_type(arr_or_dtype) + return issubclass(tipo, np.complexfloating) + + +def _coerce_to_dtype(dtype): + """ coerce a string / np.dtype to a dtype """ + if 
is_categorical_dtype(dtype): + dtype = CategoricalDtype() + elif is_datetime64tz_dtype(dtype): + dtype = DatetimeTZDtype(dtype) + elif is_period_dtype(dtype): + dtype = PeriodDtype(dtype) + else: + dtype = np.dtype(dtype) + return dtype + + +def _get_dtype(arr_or_dtype): + if isinstance(arr_or_dtype, np.dtype): + return arr_or_dtype + elif isinstance(arr_or_dtype, type): + return np.dtype(arr_or_dtype) + elif isinstance(arr_or_dtype, CategoricalDtype): + return arr_or_dtype + elif isinstance(arr_or_dtype, DatetimeTZDtype): + return arr_or_dtype + elif isinstance(arr_or_dtype, PeriodDtype): + return arr_or_dtype + elif isinstance(arr_or_dtype, string_types): + if is_categorical_dtype(arr_or_dtype): + return CategoricalDtype.construct_from_string(arr_or_dtype) + elif is_datetime64tz_dtype(arr_or_dtype): + return DatetimeTZDtype.construct_from_string(arr_or_dtype) + elif is_period_dtype(arr_or_dtype): + return PeriodDtype.construct_from_string(arr_or_dtype) + + if hasattr(arr_or_dtype, 'dtype'): + arr_or_dtype = arr_or_dtype.dtype + return np.dtype(arr_or_dtype) + + +def _get_dtype_type(arr_or_dtype): + if isinstance(arr_or_dtype, np.dtype): + return arr_or_dtype.type + elif isinstance(arr_or_dtype, type): + return np.dtype(arr_or_dtype).type + elif isinstance(arr_or_dtype, CategoricalDtype): + return CategoricalDtypeType + elif isinstance(arr_or_dtype, DatetimeTZDtype): + return DatetimeTZDtypeType + elif isinstance(arr_or_dtype, PeriodDtype): + return PeriodDtypeType + elif isinstance(arr_or_dtype, string_types): + if is_categorical_dtype(arr_or_dtype): + return CategoricalDtypeType + elif is_datetime64tz_dtype(arr_or_dtype): + return DatetimeTZDtypeType + elif is_period_dtype(arr_or_dtype): + return PeriodDtypeType + return _get_dtype_type(np.dtype(arr_or_dtype)) + try: + return arr_or_dtype.dtype.type + except AttributeError: + return type(None) + + +def _get_dtype_from_object(dtype): + """Get a numpy dtype.type-style object. This handles the datetime64[ns] + and datetime64[ns, TZ] compat + + Notes + ----- + If nothing can be found, returns ``object``. + """ + + # type object from a dtype + if isinstance(dtype, type) and issubclass(dtype, np.generic): + return dtype + elif is_categorical(dtype): + return CategoricalDtype().type + elif is_datetimetz(dtype): + return DatetimeTZDtype(dtype).type + elif isinstance(dtype, np.dtype): # dtype object + try: + _validate_date_like_dtype(dtype) + except TypeError: + # should still pass if we don't have a datelike + pass + return dtype.type + elif isinstance(dtype, string_types): + if dtype == 'datetime' or dtype == 'timedelta': + dtype += '64' + + try: + return _get_dtype_from_object(getattr(np, dtype)) + except (AttributeError, TypeError): + # handles cases like _get_dtype(int) + # i.e., python objects that are valid dtypes (unlike user-defined + # types, in general) + # TypeError handles the float16 typecode of 'e' + # further handle internal types + pass + + return _get_dtype_from_object(np.dtype(dtype)) + + +def _validate_date_like_dtype(dtype): + try: + typ = np.datetime_data(dtype)[0] + except ValueError as e: + raise TypeError('%s' % e) + if typ != 'generic' and typ != 'ns': + raise ValueError('%r is too specific of a frequency, try passing %r' % + (dtype.name, dtype.type.__name__)) + + +_string_dtypes = frozenset(map(_get_dtype_from_object, (binary_type, + text_type))) + + +def pandas_dtype(dtype): + """ + Converts input into a pandas only dtype object or a numpy dtype object. 
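pandas_dtype, whose docstring begins above, now resolves period strings in addition to datetime-with-timezone and categorical strings before falling back to np.dtype. A usage sketch, not part of the diff; the import path shown is the public re-export added in later releases (pandas.api.types), not the module being edited here:

from pandas.api.types import pandas_dtype

print(pandas_dtype('int64'))                 # dtype('int64') - a plain numpy dtype
print(pandas_dtype('datetime64[ns, UTC]'))   # datetime64[ns, UTC] (DatetimeTZDtype)
print(pandas_dtype('category'))              # category (CategoricalDtype)
print(pandas_dtype('period[D]'))             # period[D] (PeriodDtype)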
+ + Parameters + ---------- + dtype : object to be converted + + Returns + ------- + np.dtype or a pandas dtype + """ + if isinstance(dtype, DatetimeTZDtype): + return dtype + elif isinstance(dtype, PeriodDtype): + return dtype + elif isinstance(dtype, CategoricalDtype): + return dtype + elif isinstance(dtype, string_types): + try: + return DatetimeTZDtype.construct_from_string(dtype) + except TypeError: + pass + + if dtype.startswith('period[') or dtype.startswith('Period['): + # do not parse string like U as period[U] + try: + return PeriodDtype.construct_from_string(dtype) + except TypeError: + pass + + try: + return CategoricalDtype.construct_from_string(dtype) + except TypeError: + pass + elif isinstance(dtype, ExtensionDtype): + return dtype + + return np.dtype(dtype) diff --git a/pandas/types/concat.py b/pandas/types/concat.py index 5cd7abb6889b7..8bdd71348a537 100644 --- a/pandas/types/concat.py +++ b/pandas/types/concat.py @@ -3,10 +3,22 @@ """ import numpy as np -import pandas.core.common as com import pandas.tslib as tslib from pandas import compat -from pandas.compat import map +from pandas.core.algorithms import take_1d +from .common import (is_categorical_dtype, + is_sparse, + is_datetimetz, + is_datetime64_dtype, + is_timedelta64_dtype, + is_period_dtype, + is_object_dtype, + is_bool_dtype, + is_dtype_equal, + _NS_DTYPE, + _TD_DTYPE) +from pandas.types.generic import (ABCDatetimeIndex, ABCTimedeltaIndex, + ABCPeriodIndex) def get_dtype_kinds(l): @@ -24,20 +36,24 @@ def get_dtype_kinds(l): for arr in l: dtype = arr.dtype - if com.is_categorical_dtype(dtype): + if is_categorical_dtype(dtype): typ = 'category' - elif com.is_sparse(arr): + elif is_sparse(arr): typ = 'sparse' - elif com.is_datetimetz(arr): - typ = 'datetimetz' - elif com.is_datetime64_dtype(dtype): + elif is_datetimetz(arr): + # if to_concat contains different tz, + # the result must be object dtype + typ = str(arr.dtype) + elif is_datetime64_dtype(dtype): typ = 'datetime' - elif com.is_timedelta64_dtype(dtype): + elif is_timedelta64_dtype(dtype): typ = 'timedelta' - elif com.is_object_dtype(dtype): + elif is_object_dtype(dtype): typ = 'object' - elif com.is_bool_dtype(dtype): + elif is_bool_dtype(dtype): typ = 'bool' + elif is_period_dtype(dtype): + typ = str(arr.dtype) else: typ = dtype.kind typs.add(typ) @@ -51,14 +67,14 @@ def _get_series_result_type(result): """ if isinstance(result, dict): # concat Series with axis 1 - if all(com.is_sparse(c) for c in compat.itervalues(result)): + if all(is_sparse(c) for c in compat.itervalues(result)): from pandas.sparse.api import SparseDataFrame return SparseDataFrame else: from pandas.core.frame import DataFrame return DataFrame - elif com.is_sparse(result): + elif is_sparse(result): # concat Series with axis 1 from pandas.sparse.api import SparseSeries return SparseSeries @@ -116,16 +132,21 @@ def is_nonempty(x): typs = get_dtype_kinds(to_concat) - # these are mandated to handle empties as well - if 'datetime' in typs or 'datetimetz' in typs or 'timedelta' in typs: + _contains_datetime = any(typ.startswith('datetime') for typ in typs) + _contains_period = any(typ.startswith('period') for typ in typs) + + if 'category' in typs: + # this must be priort to _concat_datetime, + # to support Categorical + datetime-like + return _concat_categorical(to_concat, axis=axis) + + elif _contains_datetime or 'timedelta' in typs or _contains_period: return _concat_datetime(to_concat, axis=axis, typs=typs) + # these are mandated to handle empties as well elif 'sparse' in typs: return 
_concat_sparse(to_concat, axis=axis, typs=typs) - elif 'category' in typs: - return _concat_categorical(to_concat, axis=axis) - if not nonempty: # we have all empties, but may need to coerce the result dtype to # object if we have non-numeric type operands (numpy would otherwise @@ -161,44 +182,113 @@ def _concat_categorical(to_concat, axis=0): A single array, preserving the combined dtypes """ - from pandas.core.categorical import Categorical - - def convert_categorical(x): - # coerce to object dtype - if com.is_categorical_dtype(x.dtype): - return x.get_values() - return x.ravel() - - if get_dtype_kinds(to_concat) - set(['object', 'category']): - # convert to object type and perform a regular concat - return _concat_compat([np.array(x, copy=False, dtype=object) - for x in to_concat], axis=0) + def _concat_asobject(to_concat): + to_concat = [x.get_values() if is_categorical_dtype(x.dtype) + else x.ravel() for x in to_concat] + res = _concat_compat(to_concat) + if axis == 1: + return res.reshape(1, len(res)) + else: + return res # we could have object blocks and categoricals here # if we only have a single categoricals then combine everything # else its a non-compat categorical - categoricals = [x for x in to_concat if com.is_categorical_dtype(x.dtype)] + categoricals = [x for x in to_concat if is_categorical_dtype(x.dtype)] # validate the categories - categories = categoricals[0] - rawcats = categories.categories - for x in categoricals[1:]: - if not categories.is_dtype_equal(x): - raise ValueError("incompatible categories in categorical concat") - - # we've already checked that all categoricals are the same, so if their - # length is equal to the input then we have all the same categories - if len(categoricals) == len(to_concat): - # concating numeric types is much faster than concating object types - # and fastpath takes a shorter path through the constructor - return Categorical(np.concatenate([x.codes for x in to_concat], - axis=0), - rawcats, ordered=categoricals[0].ordered, - fastpath=True) + if len(categoricals) != len(to_concat): + pass + else: + # when all categories are identical + first = to_concat[0] + if all(first.is_dtype_equal(other) for other in to_concat[1:]): + return union_categoricals(categoricals) + + return _concat_asobject(to_concat) + + +def union_categoricals(to_union, sort_categories=False): + """ + Combine list-like of Categoricals, unioning categories. All + categories must have the same dtype. + + .. versionadded:: 0.19.0 + + Parameters + ---------- + to_union : list-like of Categoricals + sort_categories : boolean, default False + If true, resulting categories will be lexsorted, otherwise + they will be ordered as they appear in the data. 
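union_categoricals, whose signature and docstring begin above, combines Categoricals whose categories differ without first densifying to object. A short usage sketch, not part of the diff; the pandas.api.types import path is where the function is exposed publicly in later releases:

import pandas as pd
from pandas.api.types import union_categoricals

a = pd.Categorical(['b', 'c'])
b = pd.Categorical(['a', 'b'])

# Categories are kept in order of appearance unless sort_categories=True.
print(union_categoricals([a, b]).categories.tolist())                        # ['b', 'c', 'a']
print(union_categoricals([a, b], sort_categories=True).categories.tolist())  # ['a', 'b', 'c']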
+ + Returns + ------- + result : Categorical + + Raises + ------ + TypeError + - all inputs do not have the same dtype + - all inputs do not have the same ordered property + - all inputs are ordered and their categories are not identical + - sort_categories=True and Categoricals are ordered + ValueError + Emmpty list of categoricals passed + """ + from pandas import Index, Categorical + + if len(to_union) == 0: + raise ValueError('No Categoricals to union') + + first = to_union[0] + + if not all(is_dtype_equal(other.categories.dtype, first.categories.dtype) + for other in to_union[1:]): + raise TypeError("dtype of categories must be the same") + + ordered = False + if all(first.is_dtype_equal(other) for other in to_union[1:]): + # identical categories - fastpath + categories = first.categories + ordered = first.ordered + new_codes = np.concatenate([c.codes for c in to_union]) + + if sort_categories and ordered: + raise TypeError("Cannot use sort_categories=True with " + "ordered Categoricals") + + if sort_categories and not categories.is_monotonic_increasing: + categories = categories.sort_values() + indexer = categories.get_indexer(first.categories) + new_codes = take_1d(indexer, new_codes, fill_value=-1) + elif all(not c.ordered for c in to_union): + # different categories - union and recode + cats = first.categories.append([c.categories for c in to_union[1:]]) + categories = Index(cats.unique()) + if sort_categories: + categories = categories.sort_values() + + new_codes = [] + for c in to_union: + if len(c.categories) > 0: + indexer = categories.get_indexer(c.categories) + new_codes.append(take_1d(indexer, c.codes, fill_value=-1)) + else: + # must be all NaN + new_codes.append(c.codes) + new_codes = np.concatenate(new_codes) else: - concatted = np.concatenate(list(map(convert_categorical, to_concat)), - axis=0) - return Categorical(concatted, rawcats) + # ordered - to show a proper error message + if all(c.ordered for c in to_union): + msg = ("to union ordered Categoricals, " + "all categories must be the same") + raise TypeError(msg) + else: + raise TypeError('Categorical.ordered must be the same') + + return Categorical(new_codes, categories=categories, ordered=ordered, + fastpath=True) def _concat_datetime(to_concat, axis=0, typs=None): @@ -221,17 +311,18 @@ def convert_to_pydatetime(x, axis): # coerce to an object dtype # if dtype is of datetimetz or timezone - if x.dtype.kind == com._NS_DTYPE.kind: + if x.dtype.kind == _NS_DTYPE.kind: if getattr(x, 'tz', None) is not None: x = x.asobject.values else: shape = x.shape - x = tslib.ints_to_pydatetime(x.view(np.int64).ravel()) + x = tslib.ints_to_pydatetime(x.view(np.int64).ravel(), + box=True) x = x.reshape(shape) - elif x.dtype == com._TD_DTYPE: + elif x.dtype == _TD_DTYPE: shape = x.shape - x = tslib.ints_to_pytimedelta(x.view(np.int64).ravel()) + x = tslib.ints_to_pytimedelta(x.view(np.int64).ravel(), box=True) x = x.reshape(shape) if axis == 1: @@ -243,34 +334,71 @@ def convert_to_pydatetime(x, axis): # must be single dtype if len(typs) == 1: + _contains_datetime = any(typ.startswith('datetime') for typ in typs) + _contains_period = any(typ.startswith('period') for typ in typs) - if 'datetimetz' in typs: - # datetime with no tz should be stored as "datetime" in typs, - # thus no need to care - - # we require ALL of the same tz for datetimetz - tzs = set([str(x.tz) for x in to_concat]) - if len(tzs) == 1: - from pandas.tseries.index import DatetimeIndex - new_values = np.concatenate([x.tz_localize(None).asi8 - for x in to_concat]) - 
return DatetimeIndex(new_values, tz=list(tzs)[0]) + if _contains_datetime: - elif 'datetime' in typs: - new_values = np.concatenate([x.view(np.int64) for x in to_concat], - axis=axis) - return new_values.view(com._NS_DTYPE) + if 'datetime' in typs: + new_values = np.concatenate([x.view(np.int64) for x in + to_concat], axis=axis) + return new_values.view(_NS_DTYPE) + else: + # when to_concat has different tz, len(typs) > 1. + # thus no need to care + return _concat_datetimetz(to_concat) elif 'timedelta' in typs: new_values = np.concatenate([x.view(np.int64) for x in to_concat], axis=axis) - return new_values.view(com._TD_DTYPE) + return new_values.view(_TD_DTYPE) + + elif _contains_period: + # PeriodIndex must be handled by PeriodIndex, + # Thus can't meet this condition ATM + # Must be changed when we adding PeriodDtype + raise NotImplementedError # need to coerce to object to_concat = [convert_to_pydatetime(x, axis) for x in to_concat] return np.concatenate(to_concat, axis=axis) +def _concat_datetimetz(to_concat, name=None): + """ + concat DatetimeIndex with the same tz + all inputs must be DatetimeIndex + it is used in DatetimeIndex.append also + """ + # do not pass tz to set because tzlocal cannot be hashed + if len(set([str(x.dtype) for x in to_concat])) != 1: + raise ValueError('to_concat must have the same tz') + tz = to_concat[0].tz + # no need to localize because internal repr will not be changed + new_values = np.concatenate([x.asi8 for x in to_concat]) + return to_concat[0]._simple_new(new_values, tz=tz, name=name) + + +def _concat_index_asobject(to_concat, name=None): + """ + concat all inputs as object. DatetimeIndex, TimedeltaIndex and + PeriodIndex are converted to object dtype before concatenation + """ + + klasses = ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex + to_concat = [x.asobject if isinstance(x, klasses) else x + for x in to_concat] + + from pandas import Index + self = to_concat[0] + attribs = self._get_attributes_dict() + attribs['name'] = name + + to_concat = [x._values if isinstance(x, Index) else x + for x in to_concat] + return self._shallow_copy_with_infer(np.concatenate(to_concat), **attribs) + + def _concat_sparse(to_concat, axis=0, typs=None): """ provide concatenation of an sparse/dense array of arrays each of which is a @@ -299,7 +427,7 @@ def convert_sparse(x, axis): return x if typs is None: - typs = com.get_dtype_kinds(to_concat) + typs = get_dtype_kinds(to_concat) if len(typs) == 1: # concat input as it is if all inputs are sparse @@ -323,7 +451,7 @@ def convert_sparse(x, axis): # input may be sparse / dense mixed and may have different fill_value # input must contain sparse at least 1 - sparses = [c for c in to_concat if com.is_sparse(c)] + sparses = [c for c in to_concat if is_sparse(c)] fill_values = [c.fill_value for c in sparses] sp_indexes = [c.sp_index for c in sparses] diff --git a/pandas/types/dtypes.py b/pandas/types/dtypes.py index 140d494c3e1b2..5b6d7905d4095 100644 --- a/pandas/types/dtypes.py +++ b/pandas/types/dtypes.py @@ -244,6 +244,124 @@ def __eq__(self, other): if isinstance(other, compat.string_types): return other == self.name - return isinstance(other, DatetimeTZDtype) and \ - self.unit == other.unit and \ - str(self.tz) == str(other.tz) + return (isinstance(other, DatetimeTZDtype) and + self.unit == other.unit and + str(self.tz) == str(other.tz)) + + +class PeriodDtypeType(type): + """ + the type of PeriodDtype, this metaclass determines subclass ability + """ + pass + + +class PeriodDtype(ExtensionDtype): + 
__metaclass__ = PeriodDtypeType + """ + A Period duck-typed class, suitable for holding a period with freq dtype. + + THIS IS NOT A REAL NUMPY DTYPE, but essentially a sub-class of np.int64. + """ + type = PeriodDtypeType + kind = 'O' + str = '|O08' + base = np.dtype('O') + num = 102 + _metadata = ['freq'] + _match = re.compile("(P|p)eriod\[(?P.+)\]") + _cache = {} + + def __new__(cls, freq=None): + """ + Parameters + ---------- + freq : frequency + """ + + if isinstance(freq, PeriodDtype): + return freq + + elif freq is None: + # empty constructor for pickle compat + return object.__new__(cls) + + from pandas.tseries.offsets import DateOffset + if not isinstance(freq, DateOffset): + freq = cls._parse_dtype_strict(freq) + + try: + return cls._cache[freq.freqstr] + except KeyError: + u = object.__new__(cls) + u.freq = freq + cls._cache[freq.freqstr] = u + return u + + @classmethod + def _parse_dtype_strict(cls, freq): + if isinstance(freq, compat.string_types): + if freq.startswith('period[') or freq.startswith('Period['): + m = cls._match.search(freq) + if m is not None: + freq = m.group('freq') + from pandas.tseries.frequencies import to_offset + freq = to_offset(freq) + if freq is not None: + return freq + + raise ValueError("could not construct PeriodDtype") + + @classmethod + def construct_from_string(cls, string): + """ + attempt to construct this type from a string, raise a TypeError + if its not possible + """ + from pandas.tseries.offsets import DateOffset + if isinstance(string, (compat.string_types, DateOffset)): + # avoid tuple to be regarded as freq + try: + return cls(freq=string) + except ValueError: + pass + raise TypeError("could not construct PeriodDtype") + + def __unicode__(self): + return "period[{freq}]".format(freq=self.freq.freqstr) + + @property + def name(self): + return str(self) + + def __hash__(self): + # make myself hashable + return hash(str(self)) + + def __eq__(self, other): + if isinstance(other, compat.string_types): + return other == self.name or other == self.name.title() + + return isinstance(other, PeriodDtype) and self.freq == other.freq + + @classmethod + def is_dtype(cls, dtype): + """ + Return a boolean if we if the passed type is an actual dtype that we + can match (via string or type) + """ + + if isinstance(dtype, compat.string_types): + # PeriodDtype can be instanciated from freq string like "U", + # but dosn't regard freq str like "U" as dtype. 
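A quick interactive sketch of the PeriodDtype behaviour defined above and continued below; this is not part of the diff, and the top-level pd.PeriodDtype import is the public name the class acquired in later releases:

import pandas as pd

dtype = pd.PeriodDtype('D')
print(dtype)                  # period[D]
print(dtype.freq)             # <Day>
print(dtype == 'period[D]')   # True - equality against the string form is supported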
+ if dtype.startswith('period[') or dtype.startswith('Period['): + try: + if cls._parse_dtype_strict(dtype) is not None: + return True + else: + return False + except ValueError: + return False + else: + return False + return super(PeriodDtype, cls).is_dtype(dtype) diff --git a/pandas/types/inference.py b/pandas/types/inference.py new file mode 100644 index 0000000000000..35a2dc2fb831b --- /dev/null +++ b/pandas/types/inference.py @@ -0,0 +1,104 @@ +""" basic inference routines """ + +import collections +import re +import numpy as np +from numbers import Number +from pandas.compat import (string_types, text_type, + string_and_binary_types) +from pandas import lib + +is_bool = lib.is_bool + +is_integer = lib.is_integer + +is_float = lib.is_float + +is_complex = lib.is_complex + +is_scalar = lib.isscalar + + +def is_number(obj): + return isinstance(obj, (Number, np.number)) + + +def is_string_like(obj): + return isinstance(obj, (text_type, string_types)) + + +def _iterable_not_string(x): + return (isinstance(x, collections.Iterable) and + not isinstance(x, string_types)) + + +def is_iterator(obj): + # python 3 generators have __next__ instead of next + return hasattr(obj, 'next') or hasattr(obj, '__next__') + + +def is_re(obj): + return isinstance(obj, re._pattern_type) + + +def is_re_compilable(obj): + try: + re.compile(obj) + except TypeError: + return False + else: + return True + + +def is_list_like(arg): + return (hasattr(arg, '__iter__') and + not isinstance(arg, string_and_binary_types)) + + +def is_dict_like(arg): + return hasattr(arg, '__getitem__') and hasattr(arg, 'keys') + + +def is_named_tuple(arg): + return isinstance(arg, tuple) and hasattr(arg, '_fields') + + +def is_hashable(arg): + """Return True if hash(arg) will succeed, False otherwise. + + Some types will pass a test against collections.Hashable but fail when they + are actually hashed with hash(). + + Distinguish between these and other types by trying the call to hash() and + seeing if they raise TypeError. 
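As a brief aside, the behaviour of the new inference helpers can be sketched as follows (illustrative only, assuming the pandas/types/inference.py module added above):

    # illustrative sketch of the basic inference routines
    from pandas.types import inference

    inference.is_list_like([1, 2, 3])      # True
    inference.is_list_like('abc')          # False: strings are excluded explicitly
    inference.is_dict_like({'a': 1})       # True
    inference.is_named_tuple((1, 2))       # False: a plain tuple has no _fields
    inference.is_re_compilable('[0-9]+')   # True
    inference.is_re_compilable(1)          # False: re.compile raises TypeError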
+ + Examples + -------- + >>> a = ([],) + >>> isinstance(a, collections.Hashable) + True + >>> is_hashable(a) + False + """ + # unfortunately, we can't use isinstance(arg, collections.Hashable), which + # can be faster than calling hash, because numpy scalars on Python 3 fail + # this test + + # reconsider this decision once this numpy bug is fixed: + # https://github.com/numpy/numpy/issues/5562 + + try: + hash(arg) + except TypeError: + return False + else: + return True + + +def is_sequence(x): + try: + iter(x) + len(x) # it has a length + return not isinstance(x, string_and_binary_types) + except (TypeError, AttributeError): + return False diff --git a/pandas/types/missing.py b/pandas/types/missing.py new file mode 100644 index 0000000000000..a4af127e0c381 --- /dev/null +++ b/pandas/types/missing.py @@ -0,0 +1,394 @@ +""" +missing types & inference +""" +import numpy as np +from pandas import lib +from pandas.tslib import NaT, iNaT +from .generic import (ABCMultiIndex, ABCSeries, + ABCIndexClass, ABCGeneric) +from .common import (is_string_dtype, is_datetimelike, + is_datetimelike_v_numeric, is_float_dtype, + is_datetime64_dtype, is_datetime64tz_dtype, + is_timedelta64_dtype, + is_complex_dtype, is_categorical_dtype, + is_string_like_dtype, is_bool_dtype, + is_integer_dtype, is_dtype_equal, + needs_i8_conversion, _ensure_object, + pandas_dtype, + is_scalar, + is_object_dtype, + is_integer, + _TD_DTYPE, + _NS_DTYPE, + _DATELIKE_DTYPES) +from .inference import is_list_like + + +def isnull(obj): + """Detect missing values (NaN in numeric arrays, None/NaN in object arrays) + + Parameters + ---------- + arr : ndarray or object value + Object to check for null-ness + + Returns + ------- + isnulled : array-like of bool or bool + Array or bool indicating whether an object is null or if an array is + given which of the element is null. + + See also + -------- + pandas.notnull: boolean inverse of pandas.isnull + """ + return _isnull(obj) + + +def _isnull_new(obj): + if is_scalar(obj): + return lib.checknull(obj) + # hack (for now) because MI registers as ndarray + elif isinstance(obj, ABCMultiIndex): + raise NotImplementedError("isnull is not defined for MultiIndex") + elif isinstance(obj, (ABCSeries, np.ndarray, ABCIndexClass)): + return _isnull_ndarraylike(obj) + elif isinstance(obj, ABCGeneric): + return obj._constructor(obj._data.isnull(func=isnull)) + elif isinstance(obj, list) or hasattr(obj, '__array__'): + return _isnull_ndarraylike(np.asarray(obj)) + else: + return obj is None + + +def _isnull_old(obj): + """Detect missing values. Treat None, NaN, INF, -INF as null. + + Parameters + ---------- + arr: ndarray or object value + + Returns + ------- + boolean ndarray or boolean + """ + if is_scalar(obj): + return lib.checknull_old(obj) + # hack (for now) because MI registers as ndarray + elif isinstance(obj, ABCMultiIndex): + raise NotImplementedError("isnull is not defined for MultiIndex") + elif isinstance(obj, (ABCSeries, np.ndarray, ABCIndexClass)): + return _isnull_ndarraylike_old(obj) + elif isinstance(obj, ABCGeneric): + return obj._constructor(obj._data.isnull(func=_isnull_old)) + elif isinstance(obj, list) or hasattr(obj, '__array__'): + return _isnull_ndarraylike_old(np.asarray(obj)) + else: + return obj is None + + +_isnull = _isnull_new + + +def _use_inf_as_null(key): + """Option change callback for null/inf behaviour + Choose which replacement for numpy.isnan / -numpy.isfinite is used. 
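A small behaviour sketch of the two null-detection paths (illustrative only, assuming the new pandas/types/missing.py module and the existing use_inf_as_null option):

    # illustrative sketch -- default path vs. the use_inf_as_null option
    import numpy as np
    from pandas.types.missing import isnull, notnull

    isnull(np.array([1.0, np.nan, np.inf]))   # array([False,  True, False])
    notnull(None)                             # False
    # after pd.set_option('use_inf_as_null', True) the module swaps in
    # _isnull_old, and +/-inf are then reported as null as well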
+ + Parameters + ---------- + flag: bool + True means treat None, NaN, INF, -INF as null (old way), + False means None and NaN are null, but INF, -INF are not null + (new way). + + Notes + ----- + This approach to setting global module values is discussed and + approved here: + + * http://stackoverflow.com/questions/4859217/ + programmatically-creating-variables-in-python/4859312#4859312 + """ + from pandas.core.config import get_option + flag = get_option(key) + if flag: + globals()['_isnull'] = _isnull_old + else: + globals()['_isnull'] = _isnull_new + + +def _isnull_ndarraylike(obj): + + values = getattr(obj, 'values', obj) + dtype = values.dtype + + if is_string_dtype(dtype): + if is_categorical_dtype(values): + from pandas import Categorical + if not isinstance(values, Categorical): + values = values.values + result = values.isnull() + else: + + # Working around NumPy ticket 1542 + shape = values.shape + + if is_string_like_dtype(dtype): + result = np.zeros(values.shape, dtype=bool) + else: + result = np.empty(shape, dtype=bool) + vec = lib.isnullobj(values.ravel()) + result[...] = vec.reshape(shape) + + elif needs_i8_conversion(obj): + # this is the NaT pattern + result = values.view('i8') == iNaT + else: + result = np.isnan(values) + + # box + if isinstance(obj, ABCSeries): + from pandas import Series + result = Series(result, index=obj.index, name=obj.name, copy=False) + + return result + + +def _isnull_ndarraylike_old(obj): + values = getattr(obj, 'values', obj) + dtype = values.dtype + + if is_string_dtype(dtype): + # Working around NumPy ticket 1542 + shape = values.shape + + if is_string_like_dtype(dtype): + result = np.zeros(values.shape, dtype=bool) + else: + result = np.empty(shape, dtype=bool) + vec = lib.isnullobj_old(values.ravel()) + result[:] = vec.reshape(shape) + + elif dtype in _DATELIKE_DTYPES: + # this is the NaT pattern + result = values.view('i8') == iNaT + else: + result = ~np.isfinite(values) + + # box + if isinstance(obj, ABCSeries): + from pandas import Series + result = Series(result, index=obj.index, name=obj.name, copy=False) + + return result + + +def notnull(obj): + """Replacement for numpy.isfinite / -numpy.isnan which is suitable for use + on object arrays. + + Parameters + ---------- + arr : ndarray or object value + Object to check for *not*-null-ness + + Returns + ------- + isnulled : array-like of bool or bool + Array or bool indicating whether an object is *not* null or if an array + is given which of the element is *not* null. + + See also + -------- + pandas.isnull : boolean inverse of pandas.notnull + """ + res = isnull(obj) + if is_scalar(res): + return not res + return ~res + + +def is_null_datelike_scalar(other): + """ test whether the object is a null datelike, e.g. 
Nat + but guard against passing a non-scalar """ + if other is NaT or other is None: + return True + elif is_scalar(other): + + # a timedelta + if hasattr(other, 'dtype'): + return other.view('i8') == iNaT + elif is_integer(other) and other == iNaT: + return True + return isnull(other) + return False + + +def _is_na_compat(arr, fill_value=np.nan): + """ + Parameters + ---------- + arr: a numpy array + fill_value: fill value, default to np.nan + + Returns + ------- + True if we can fill using this fill_value + """ + dtype = arr.dtype + if isnull(fill_value): + return not (is_bool_dtype(dtype) or + is_integer_dtype(dtype)) + return True + + +def array_equivalent(left, right, strict_nan=False): + """ + True if two arrays, left and right, have equal non-NaN elements, and NaNs + in corresponding locations. False otherwise. It is assumed that left and + right are NumPy arrays of the same dtype. The behavior of this function + (particularly with respect to NaNs) is not defined if the dtypes are + different. + + Parameters + ---------- + left, right : ndarrays + strict_nan : bool, default False + If True, consider NaN and None to be different. + + Returns + ------- + b : bool + Returns True if the arrays are equivalent. + + Examples + -------- + >>> array_equivalent( + ... np.array([1, 2, np.nan]), + ... np.array([1, 2, np.nan])) + True + >>> array_equivalent( + ... np.array([1, np.nan, 2]), + ... np.array([1, 2, np.nan])) + False + """ + + left, right = np.asarray(left), np.asarray(right) + + # shape compat + if left.shape != right.shape: + return False + + # Object arrays can contain None, NaN and NaT. + # string dtypes must be come to this path for NumPy 1.7.1 compat + if is_string_dtype(left) or is_string_dtype(right): + + if not strict_nan: + # isnull considers NaN and None to be equivalent. + return lib.array_equivalent_object( + _ensure_object(left.ravel()), _ensure_object(right.ravel())) + + for left_value, right_value in zip(left, right): + if left_value is NaT and right_value is not NaT: + return False + + elif isinstance(left_value, float) and np.isnan(left_value): + if (not isinstance(right_value, float) or + not np.isnan(right_value)): + return False + else: + if left_value != right_value: + return False + return True + + # NaNs can occur in float and complex arrays. + if is_float_dtype(left) or is_complex_dtype(left): + return ((left == right) | (np.isnan(left) & np.isnan(right))).all() + + # numpy will will not allow this type of datetimelike vs integer comparison + elif is_datetimelike_v_numeric(left, right): + return False + + # M8/m8 + elif needs_i8_conversion(left) and needs_i8_conversion(right): + if not is_dtype_equal(left.dtype, right.dtype): + return False + + left = left.view('i8') + right = right.view('i8') + + # NaNs cannot occur otherwise. + try: + return np.array_equal(left, right) + except AttributeError: + # see gh-13388 + # + # NumPy v1.7.1 has a bug in its array_equal + # function that prevents it from correctly + # comparing two arrays with complex dtypes. 
+ # This bug is corrected in v1.8.0, so remove + # this try-except block as soon as we stop + # supporting NumPy versions < 1.8.0 + if not is_dtype_equal(left.dtype, right.dtype): + return False + + left = left.tolist() + right = right.tolist() + + return left == right + + +def _infer_fill_value(val): + """ + infer the fill value for the nan/NaT from the provided + scalar/ndarray/list-like if we are a NaT, return the correct dtyped + element to provide proper block construction + """ + + if not is_list_like(val): + val = [val] + val = np.array(val, copy=False) + if is_datetimelike(val): + return np.array('NaT', dtype=val.dtype) + elif is_object_dtype(val.dtype): + dtype = lib.infer_dtype(_ensure_object(val)) + if dtype in ['datetime', 'datetime64']: + return np.array('NaT', dtype=_NS_DTYPE) + elif dtype in ['timedelta', 'timedelta64']: + return np.array('NaT', dtype=_TD_DTYPE) + return np.nan + + +def _maybe_fill(arr, fill_value=np.nan): + """ + if we have a compatiable fill_value and arr dtype, then fill + """ + if _is_na_compat(arr, fill_value): + arr.fill(fill_value) + return arr + + +def na_value_for_dtype(dtype): + """ + Return a dtype compat na value + + Parameters + ---------- + dtype : string / dtype + + Returns + ------- + np.dtype or a pandas dtype + """ + dtype = pandas_dtype(dtype) + + if (is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype) or + is_timedelta64_dtype(dtype)): + return NaT + elif is_float_dtype(dtype): + return np.nan + elif is_integer_dtype(dtype): + return 0 + elif is_bool_dtype(dtype): + return False + return np.nan diff --git a/pandas/util/decorators.py b/pandas/util/decorators.py index 58cd0c13d8ec7..e1888a3ffd62a 100644 --- a/pandas/util/decorators.py +++ b/pandas/util/decorators.py @@ -94,7 +94,7 @@ def wrapper(*args, **kwargs): # Substitution and Appender are derived from matplotlib.docstring (1.1.0) -# module http://matplotlib.sourceforge.net/users/license.html +# module http://matplotlib.org/users/license.html class Substitution(object): diff --git a/pandas/util/depr_module.py b/pandas/util/depr_module.py new file mode 100644 index 0000000000000..7e03a000a50ec --- /dev/null +++ b/pandas/util/depr_module.py @@ -0,0 +1,79 @@ +""" +This module houses a utility class for mocking deprecated modules. +It is for internal use only and should not be used beyond this purpose. +""" + +import warnings +import importlib + + +class _DeprecatedModule(object): + """ Class for mocking deprecated modules. + + Parameters + ---------- + deprmod : name of module to be deprecated. + alts : alternative modules to be used to access objects or methods + available in module. + removals : objects or methods in module that will no longer be + accessible once module is removed. + """ + def __init__(self, deprmod, alts=None, removals=None): + self.deprmod = deprmod + + self.alts = alts + if self.alts is not None: + self.alts = frozenset(self.alts) + + self.removals = removals + if self.removals is not None: + self.removals = frozenset(self.removals) + + # For introspection purposes. 
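The missing-value helpers above can be summarised with a short sketch (illustrative only, assuming this patch is applied):

    # illustrative sketch of na_value_for_dtype from pandas/types/missing.py above
    import numpy as np
    from pandas.types.missing import na_value_for_dtype

    na_value_for_dtype(np.dtype('datetime64[ns]'))   # NaT
    na_value_for_dtype(np.dtype('float64'))          # nan
    na_value_for_dtype(np.dtype('int64'))            # 0
    na_value_for_dtype(np.dtype('bool'))             # False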
+ self.self_dir = frozenset(dir(self.__class__)) + + def __dir__(self): + _dir = object.__dir__(self) + + if self.removals is not None: + _dir.extend(list(self.removals)) + + if self.alts is not None: + for modname in self.alts: + module = importlib.import_module(modname) + _dir.extend(dir(module)) + + return _dir + + def __getattr__(self, name): + if name in self.self_dir: + return object.__getattribute__(self, name) + + if self.removals is not None and name in self.removals: + with warnings.catch_warnings(): + warnings.filterwarnings('ignore', category=FutureWarning) + module = importlib.import_module(self.deprmod) + + warnings.warn( + "{deprmod}.{name} is deprecated and will be removed in " + "a future version.".format(deprmod=self.deprmod, name=name), + FutureWarning, stacklevel=2) + + return object.__getattribute__(module, name) + + if self.alts is not None: + for modname in self.alts: + module = importlib.import_module(modname) + + if hasattr(module, name): + warnings.warn( + "{deprmod}.{name} is deprecated. Please use " + "{modname}.{name} instead.".format( + deprmod=self.deprmod, modname=modname, name=name), + FutureWarning, stacklevel=2) + + return getattr(module, name) + + raise AttributeError("module '{deprmod}' has no attribute " + "'{name}'".format(deprmod=self.deprmod, + name=name)) diff --git a/pandas/util/print_versions.py b/pandas/util/print_versions.py index e74568f39418c..3747e2ff6ca8f 100644 --- a/pandas/util/print_versions.py +++ b/pandas/util/print_versions.py @@ -4,6 +4,7 @@ import struct import subprocess import codecs +import locale import importlib @@ -47,6 +48,7 @@ def get_sys_info(): ("byteorder", "%s" % sys.byteorder), ("LC_ALL", "%s" % os.environ.get('LC_ALL', "None")), ("LANG", "%s" % os.environ.get('LANG', "None")), + ("LOCALE", "%s.%s" % locale.getlocale()), ]) except: @@ -99,7 +101,10 @@ def show_versions(as_json=False): deps_blob = list() for (modname, ver_f) in deps: try: - mod = importlib.import_module(modname) + if modname in sys.modules: + mod = sys.modules[modname] + else: + mod = importlib.import_module(modname) ver = ver_f(mod) deps_blob.append((modname, ver)) except: diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 03ccfcab24f58..57bb01e5e0406 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -23,11 +23,14 @@ import numpy as np import pandas as pd -from pandas.core.common import (is_sequence, array_equivalent, - is_list_like, is_datetimelike_v_numeric, - is_datetimelike_v_object, - is_number, is_bool, - needs_i8_conversion, is_categorical_dtype) +from pandas.types.missing import array_equivalent +from pandas.types.common import (is_datetimelike_v_numeric, + is_datetimelike_v_object, + is_number, is_bool, + needs_i8_conversion, + is_categorical_dtype, + is_sequence, + is_list_like) from pandas.formats.printing import pprint_thing from pandas.core.algorithms import take_1d @@ -40,7 +43,7 @@ from pandas.computation import expressions as expr -from pandas import (bdate_range, CategoricalIndex, DatetimeIndex, +from pandas import (bdate_range, CategoricalIndex, Categorical, DatetimeIndex, TimedeltaIndex, PeriodIndex, RangeIndex, Index, MultiIndex, Series, DataFrame, Panel, Panel4D) from pandas.util.decorators import deprecate @@ -60,7 +63,6 @@ def set_testing_mode(): # set the testing mode filters testing_mode = os.environ.get('PANDAS_TESTING_MODE', 'None') if 'deprecate' in testing_mode: - warnings.simplefilter('always', _testing_mode_warnings) @@ -747,10 +749,7 @@ def _get_ilevel_values(index, level): unique = 
index.levels[level] labels = index.labels[level] filled = take_1d(unique.values, labels, fill_value=unique._na_value) - values = unique._simple_new(filled, - name=index.names[level], - freq=getattr(unique, 'freq', None), - tz=getattr(unique, 'tz', None)) + values = unique._shallow_copy(filled, name=index.names[level]) return values # instance validation @@ -881,12 +880,12 @@ def assert_attr_equal(attr, left, right, obj='Attributes'): def assert_is_valid_plot_return_object(objs): import matplotlib.pyplot as plt - if isinstance(objs, np.ndarray): - for el in objs.flat: - assert isinstance(el, plt.Axes), ('one of \'objs\' is not a ' - 'matplotlib Axes instance, ' - 'type encountered {0!r}' - ''.format(el.__class__.__name__)) + if isinstance(objs, (pd.Series, np.ndarray)): + for el in objs.ravel(): + msg = ('one of \'objs\' is not a matplotlib Axes instance, ' + 'type encountered {0!r}') + assert isinstance(el, (plt.Axes, dict)), msg.format( + el.__class__.__name__) else: assert isinstance(objs, (plt.Artist, tuple, dict)), \ ('objs is neither an ndarray of Artist instances nor a ' @@ -963,19 +962,45 @@ def assertNotIsInstance(obj, cls, msg=''): def assert_categorical_equal(left, right, check_dtype=True, - obj='Categorical'): + obj='Categorical', check_category_order=True): + """Test that categoricals are eqivalent + + Parameters + ---------- + left, right : Categorical + Categoricals to compare + check_dtype : bool, default True + Check that integer dtype of the codes are the same + obj : str, default 'Categorical' + Specify object name being compared, internally used to show appropriate + assertion message + check_category_order : bool, default True + Whether the order of the categories should be compared, which + implies identical integer codes. If False, only the resulting + values are compared. The ordered attribute is + checked regardless. 
+ """ assertIsInstance(left, pd.Categorical, '[Categorical] ') assertIsInstance(right, pd.Categorical, '[Categorical] ') - assert_index_equal(left.categories, right.categories, - obj='{0}.categories'.format(obj)) - assert_numpy_array_equal(left.codes, right.codes, check_dtype=check_dtype, - obj='{0}.codes'.format(obj)) + if check_category_order: + assert_index_equal(left.categories, right.categories, + obj='{0}.categories'.format(obj)) + assert_numpy_array_equal(left.codes, right.codes, + check_dtype=check_dtype, + obj='{0}.codes'.format(obj)) + else: + assert_index_equal(left.categories.sort_values(), + right.categories.sort_values(), + obj='{0}.categories'.format(obj)) + assert_index_equal(left.categories.take(left.codes), + right.categories.take(right.codes), + obj='{0}.values'.format(obj)) assert_attr_equal('ordered', left, right, obj=obj) -def raise_assert_detail(obj, message, left, right): +def raise_assert_detail(obj, message, left, right, diff=None): if isinstance(left, np.ndarray): left = pprint_thing(left) if isinstance(right, np.ndarray): @@ -986,12 +1011,16 @@ def raise_assert_detail(obj, message, left, right): {1} [left]: {2} [right]: {3}""".format(obj, message, left, right) + + if diff is not None: + msg = msg + "\n[diff]: {diff}".format(diff=diff) + raise AssertionError(msg) def assert_numpy_array_equal(left, right, strict_nan=False, check_dtype=True, err_msg=None, - obj='numpy array'): + obj='numpy array', check_same=None): """ Checks that 'np.ndarray' is equivalent Parameters @@ -1007,6 +1036,8 @@ def assert_numpy_array_equal(left, right, strict_nan=False, obj : str, default 'numpy array' Specify object name being compared, internally used to show appropriate assertion message + check_same : None|'copy'|'same', default None + Ensure left and right refer/do not refer to the same memory area """ # instance validation @@ -1016,6 +1047,14 @@ def assert_numpy_array_equal(left, right, strict_nan=False, assertIsInstance(left, np.ndarray, '[ndarray] ') assertIsInstance(right, np.ndarray, '[ndarray] ') + def _get_base(obj): + return obj.base if getattr(obj, 'base', None) is not None else obj + + if check_same == 'same': + assertIs(_get_base(left), _get_base(right)) + elif check_same == 'copy': + assertIsNot(_get_base(left), _get_base(right)) + def _raise(left, right, err_msg): if err_msg is None: if left.shape != right.shape: @@ -1065,10 +1104,10 @@ def assert_series_equal(left, right, check_dtype=True, right : Series check_dtype : bool, default True Whether to check the Series dtype is identical. - check_index_type : bool / string {'equiv'}, default False + check_index_type : bool / string {'equiv'}, default 'equiv' Whether to check the Index class, dtype and inferred_type are identical. - check_series_type : bool, default False + check_series_type : bool, default True Whether to check the Series class is identical. check_less_precise : bool or int, default False Specify comparison precision. Only used when check_exact is False. @@ -1280,7 +1319,8 @@ def assert_panelnd_equal(left, right, check_less_precise=False, assert_func=assert_frame_equal, check_names=False, - by_blocks=False): + by_blocks=False, + obj='Panel'): """Check that left and right Panels are equal. Parameters @@ -1301,6 +1341,9 @@ def assert_panelnd_equal(left, right, by_blocks : bool, default False Specify how to compare internal data. If False, compare by columns. If True, compare by blocks. 
+ obj : str, default 'Panel' + Specify the object name being compared, internally used to show + the appropriate assertion message. """ if check_panel_type: @@ -1342,11 +1385,22 @@ def assert_panelnd_equal(left, right, # Sparse -def assert_sp_array_equal(left, right): +def assert_sp_array_equal(left, right, check_dtype=True): + """Check that the left and right SparseArray are equal. + + Parameters + ---------- + left : SparseArray + right : SparseArray + check_dtype : bool, default True + Whether to check the data dtype is identical. + """ + assertIsInstance(left, pd.SparseArray, '[SparseArray]') assertIsInstance(right, pd.SparseArray, '[SparseArray]') - assert_numpy_array_equal(left.sp_values, right.sp_values) + assert_numpy_array_equal(left.sp_values, right.sp_values, + check_dtype=check_dtype) # SparseIndex comparison assertIsInstance(left.sp_index, pd._sparse.SparseIndex, '[SparseIndex]') @@ -1357,15 +1411,38 @@ def assert_sp_array_equal(left, right): left.sp_index, right.sp_index) assert_attr_equal('fill_value', left, right) - assert_attr_equal('dtype', left, right) - assert_numpy_array_equal(left.values, right.values) + if check_dtype: + assert_attr_equal('dtype', left, right) + assert_numpy_array_equal(left.values, right.values, + check_dtype=check_dtype) + +def assert_sp_series_equal(left, right, check_dtype=True, exact_indices=True, + check_series_type=True, check_names=True, + obj='SparseSeries'): + """Check that the left and right SparseSeries are equal. -def assert_sp_series_equal(left, right, exact_indices=True, - check_names=True, obj='SparseSeries'): + Parameters + ---------- + left : SparseSeries + right : SparseSeries + check_dtype : bool, default True + Whether to check the Series dtype is identical. + exact_indices : bool, default True + check_series_type : bool, default True + Whether to check the SparseSeries class is identical. + check_names : bool, default True + Whether to check the SparseSeries name attribute. + obj : str, default 'SparseSeries' + Specify the object name being compared, internally used to show + the appropriate assertion message. + """ assertIsInstance(left, pd.SparseSeries, '[SparseSeries]') assertIsInstance(right, pd.SparseSeries, '[SparseSeries]') + if check_series_type: + assert_class_equal(left, right, obj=obj) + assert_index_equal(left.index, right.index, obj='{0}.index'.format(obj)) @@ -1373,20 +1450,37 @@ def assert_sp_series_equal(left, right, exact_indices=True, if check_names: assert_attr_equal('name', left, right) - assert_attr_equal('dtype', left, right) + if check_dtype: + assert_attr_equal('dtype', left, right) assert_numpy_array_equal(left.values, right.values) -def assert_sp_frame_equal(left, right, exact_indices=True, - obj='SparseDataFrame'): - """ - exact: Series SparseIndex objects must be exactly the same, otherwise just - compare dense representations +def assert_sp_frame_equal(left, right, check_dtype=True, exact_indices=True, + check_frame_type=True, obj='SparseDataFrame'): + """Check that the left and right SparseDataFrame are equal. + + Parameters + ---------- + left : SparseDataFrame + right : SparseDataFrame + check_dtype : bool, default True + Whether to check the Series dtype is identical. + exact_indices : bool, default True + SparseSeries SparseIndex objects must be exactly the same, + otherwise just compare dense representations. + check_frame_type : bool, default True + Whether to check the SparseDataFrame class is identical. 
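Similarly, the check_same argument added to assert_numpy_array_equal above can be sketched as (illustrative only):

    # illustrative sketch of check_same: verify view vs. copy semantics
    import numpy as np
    import pandas.util.testing as tm

    arr = np.array([1, 2, 3])
    view = arr[:]          # shares memory with arr
    copy = arr.copy()      # owns its own memory

    tm.assert_numpy_array_equal(arr, view, check_same='same')   # passes
    tm.assert_numpy_array_equal(arr, copy, check_same='copy')   # passes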
+ obj : str, default 'SparseDataFrame' + Specify the object name being compared, internally used to show + the appropriate assertion message. """ assertIsInstance(left, pd.SparseDataFrame, '[SparseDataFrame]') assertIsInstance(right, pd.SparseDataFrame, '[SparseDataFrame]') + if check_frame_type: + assert_class_equal(left, right, obj=obj) + assert_index_equal(left.index, right.index, obj='{0}.index'.format(obj)) assert_index_equal(left.columns, right.columns, @@ -1397,9 +1491,11 @@ def assert_sp_frame_equal(left, right, exact_indices=True, # trade-off? if exact_indices: - assert_sp_series_equal(series, right[col]) + assert_sp_series_equal(series, right[col], + check_dtype=check_dtype) else: - assert_series_equal(series.to_dense(), right[col].to_dense()) + assert_series_equal(series.to_dense(), right[col].to_dense(), + check_dtype=check_dtype) assert_attr_equal('default_fill_value', left, right, obj=obj) @@ -1410,22 +1506,6 @@ def assert_sp_frame_equal(left, right, exact_indices=True, assert (col in left) -def assert_sp_panel_equal(left, right, exact_indices=True): - assertIsInstance(left, pd.SparsePanel, '[SparsePanel]') - assertIsInstance(right, pd.SparsePanel, '[SparsePanel]') - - for item, frame in left.iteritems(): - assert (item in right) - # trade-off? - assert_sp_frame_equal(frame, right[item], exact_indices=exact_indices) - - assert_almost_equal(left.default_fill_value, right.default_fill_value) - assert (left.default_kind == right.default_kind) - - for item in right: - assert (item in left) - - def assert_sp_list_equal(left, right): assertIsInstance(left, pd.SparseList, '[SparseList]') assertIsInstance(right, pd.SparseList, '[SparseList]') @@ -1472,7 +1552,7 @@ def makeStringIndex(k=10, name=None): def makeUnicodeIndex(k=10, name=None): - return Index(randu_array(nchars=10, size=k)) + return Index(randu_array(nchars=10, size=k), name=name) def makeCategoricalIndex(k=10, n=3, name=None): @@ -2581,6 +2661,37 @@ def _constructor_sliced(self): return SubclassedSeries +class SubclassedSparseSeries(pd.SparseSeries): + _metadata = ['testattr'] + + @property + def _constructor(self): + return SubclassedSparseSeries + + @property + def _constructor_expanddim(self): + return SubclassedSparseDataFrame + + +class SubclassedSparseDataFrame(pd.SparseDataFrame): + _metadata = ['testattr'] + + @property + def _constructor(self): + return SubclassedSparseDataFrame + + @property + def _constructor_sliced(self): + return SubclassedSparseSeries + + +class SubclassedCategorical(Categorical): + + @property + def _constructor(self): + return SubclassedCategorical + + @contextmanager def patch(ob, attr, value): """Temporarily patch an attribute of an object. @@ -2631,3 +2742,50 @@ def patch(ob, attr, value): delattr(ob, attr) else: setattr(ob, attr, old) + + +@contextmanager +def set_timezone(tz): + """Context manager for temporarily setting a timezone. + + Parameters + ---------- + tz : str + A string representing a valid timezone. + + Examples + -------- + + >>> from datetime import datetime + >>> from dateutil.tz import tzlocal + >>> tzlocal().tzname(datetime.now()) + 'IST' + + >>> with set_timezone('US/Eastern'): + ... tzlocal().tzname(datetime.now()) + ... 
+ 'EDT' + """ + if is_platform_windows(): + import nose + raise nose.SkipTest("timezone setting not supported on windows") + + import os + import time + + def setTZ(tz): + if tz is None: + try: + del os.environ['TZ'] + except: + pass + else: + os.environ['TZ'] = tz + time.tzset() + + orig_tz = os.environ.get('TZ') + setTZ(tz) + try: + yield + finally: + setTZ(orig_tz) diff --git a/pandas/util/validators.py b/pandas/util/validators.py index bbfd24df9c13e..964fa9d9b38d5 100644 --- a/pandas/util/validators.py +++ b/pandas/util/validators.py @@ -3,6 +3,8 @@ for validating data or function arguments """ +from pandas.types.common import is_bool + def _check_arg_length(fname, args, max_fname_arg_count, compat_args): """ @@ -35,8 +37,6 @@ def _check_for_default_values(fname, arg_val_dict, compat_args): checked that arg_val_dict.keys() is a subset of compat_args """ - from pandas.core.common import is_bool - for key in arg_val_dict: # try checking equality directly with '=' operator, # as comparison may have been overriden for the left diff --git a/pandas/window.pyx b/pandas/window.pyx new file mode 100644 index 0000000000000..8235d68e2a88b --- /dev/null +++ b/pandas/window.pyx @@ -0,0 +1,1649 @@ +# cython: profile=False +# cython: boundscheck=False, wraparound=False, cdivision=True + +from numpy cimport * +cimport numpy as np +import numpy as np + +cimport cython + +import_array() + +cimport util + +from libc.stdlib cimport malloc, free + +from numpy cimport NPY_INT8 as NPY_int8 +from numpy cimport NPY_INT16 as NPY_int16 +from numpy cimport NPY_INT32 as NPY_int32 +from numpy cimport NPY_INT64 as NPY_int64 +from numpy cimport NPY_FLOAT16 as NPY_float16 +from numpy cimport NPY_FLOAT32 as NPY_float32 +from numpy cimport NPY_FLOAT64 as NPY_float64 + +from numpy cimport (int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, + uint32_t, uint64_t, float16_t, float32_t, float64_t) + +int8 = np.dtype(np.int8) +int16 = np.dtype(np.int16) +int32 = np.dtype(np.int32) +int64 = np.dtype(np.int64) +float16 = np.dtype(np.float16) +float32 = np.dtype(np.float32) +float64 = np.dtype(np.float64) + +cdef np.int8_t MINint8 = np.iinfo(np.int8).min +cdef np.int16_t MINint16 = np.iinfo(np.int16).min +cdef np.int32_t MINint32 = np.iinfo(np.int32).min +cdef np.int64_t MINint64 = np.iinfo(np.int64).min +cdef np.float16_t MINfloat16 = np.NINF +cdef np.float32_t MINfloat32 = np.NINF +cdef np.float64_t MINfloat64 = np.NINF + +cdef np.int8_t MAXint8 = np.iinfo(np.int8).max +cdef np.int16_t MAXint16 = np.iinfo(np.int16).max +cdef np.int32_t MAXint32 = np.iinfo(np.int32).max +cdef np.int64_t MAXint64 = np.iinfo(np.int64).max +cdef np.float16_t MAXfloat16 = np.inf +cdef np.float32_t MAXfloat32 = np.inf +cdef np.float64_t MAXfloat64 = np.inf + +cdef double NaN = np.NaN +cdef double nan = NaN + +cdef inline int int_max(int a, int b): return a if a >= b else b +cdef inline int int_min(int a, int b): return a if a <= b else b + +from util cimport numeric + +from skiplist cimport * + +cdef extern from "src/headers/math.h": + double sqrt(double x) nogil + int signbit(double) nogil + +include "skiplist.pyx" + +# Cython implementations of rolling sum, mean, variance, skewness, +# other statistical moment functions +# +# Misc implementation notes +# ------------------------- +# +# - In Cython x * x is faster than x ** 2 for C types, this should be +# periodically revisited to see if it's still true. 
+# + + +def _check_minp(win, minp, N, floor=None): + """ + Parameters + ---------- + win: int + minp: int or None + N: len of window + floor: int, optional + default 1 + + Returns + ------- + minimum period + """ + + if minp is None: + minp = 1 + if not util.is_integer_object(minp): + raise ValueError("min_periods must be an integer") + if minp > win: + raise ValueError("min_periods (%d) must be <= " + "window (%d)" % (minp, win)) + elif minp > N: + minp = N + 1 + elif minp < 0: + raise ValueError('min_periods must be >= 0') + if floor is None: + floor = 1 + + return max(minp, floor) + +# original C implementation by N. Devillard. +# This code in public domain. +# Function : kth_smallest() +# In : array of elements, # of elements in the array, rank k +# Out : one element +# Job : find the kth smallest element in the array + +# Reference: + +# Author: Wirth, Niklaus +# Title: Algorithms + data structures = programs +# Publisher: Englewood Cliffs: Prentice-Hall, 1976 +# Physical description: 366 p. +# Series: Prentice-Hall Series in Automatic Computation + +# ---------------------------------------------------------------------- +# The indexer objects for rolling +# These define start/end indexers to compute offsets + + +cdef class WindowIndexer: + + cdef: + ndarray start, end + int64_t N, minp, win + bint is_variable + + def get_data(self): + return (self.start, self.end, self.N, + self.win, self.minp, + self.is_variable) + + +cdef class MockFixedWindowIndexer(WindowIndexer): + """ + + We are just checking parameters of the indexer, + and returning a consistent API with fixed/variable + indexers. + + Parameters + ---------- + input: ndarray + input data array + win: int64_t + window size + minp: int64_t + min number of obs in a window to consider non-NaN + index: object + index of the input + floor: optional + unit for flooring + + """ + def __init__(self, ndarray input, int64_t win, int64_t minp, + object index=None, object floor=None): + + assert index is None + self.is_variable = 0 + self.N = len(input) + self.minp = _check_minp(win, minp, self.N, floor=floor) + self.start = np.empty(0, dtype='int64') + self.end = np.empty(0, dtype='int64') + self.win = win + + +cdef class FixedWindowIndexer(WindowIndexer): + """ + create a fixed length window indexer object + that has start & end, that point to offsets in + the index object; these are defined based on the win + arguments + + Parameters + ---------- + input: ndarray + input data array + win: int64_t + window size + minp: int64_t + min number of obs in a window to consider non-NaN + index: object + index of the input + floor: optional + unit for flooring the unit + + """ + def __init__(self, ndarray input, int64_t win, int64_t minp, + object index=None, object floor=None): + cdef ndarray start_s, start_e, end_s, end_e + + assert index is None + self.is_variable = 0 + self.N = len(input) + self.minp = _check_minp(win, minp, self.N, floor=floor) + + start_s = np.zeros(win, dtype='int64') + start_e = np.arange(win, self.N, dtype='int64') - win + 1 + self.start = np.concatenate([start_s, start_e]) + + end_s = np.arange(win, dtype='int64') + 1 + end_e = start_e + win + self.end = np.concatenate([end_s, end_e]) + self.win = win + + +cdef class VariableWindowIndexer(WindowIndexer): + """ + create a variable length window indexer object + that has start & end, that point to offsets in + the index object; these are defined based on the win + arguments + + Parameters + ---------- + input: ndarray + input data array + win: int64_t + window size + 
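To make the fixed indexer concrete, the offset arithmetic above works out as follows for win=3 over five observations (a plain-Python sketch of the same computation, not the Cython class itself):

    # plain-Python sketch of FixedWindowIndexer's start/end offsets (win=3, N=5)
    import numpy as np

    win, N = 3, 5
    start_s = np.zeros(win, dtype='int64')
    start_e = np.arange(win, N, dtype='int64') - win + 1
    start = np.concatenate([start_s, start_e])                 # [0 0 0 1 2]
    end = np.concatenate([np.arange(win, dtype='int64') + 1,
                          start_e + win])                      # [1 2 3 4 5]
    # windows are [0:1], [0:2], [0:3], [1:4], [2:5] -- expanding until the
    # window is full, then sliding one observation at a time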
minp: int64_t + min number of obs in a window to consider non-NaN + index: ndarray + index of the input + + """ + def __init__(self, ndarray input, int64_t win, int64_t minp, + ndarray index): + + self.is_variable = 1 + self.N = len(index) + self.minp = _check_minp(win, minp, self.N) + + self.start = np.empty(self.N, dtype='int64') + self.start.fill(-1) + + self.end = np.empty(self.N, dtype='int64') + self.end.fill(-1) + + self.build(index, win) + + # max window size + self.win = (self.end - self.start).max() + + def build(self, ndarray[int64_t] index, int64_t win): + + cdef: + ndarray[int64_t] start, end + int64_t start_bound, end_bound, N + Py_ssize_t i, j + + start = self.start + end = self.end + N = self.N + + start[0] = 0 + end[0] = 1 + + with nogil: + + # start is start of slice interval (including) + # end is end of slice interval (not including) + for i in range(1, N): + end_bound = index[i] + start_bound = index[i] - win + + # advance the start bound until we are + # within the constraint + start[i] = i + for j in range(start[i - 1], i): + if index[j] > start_bound: + start[i] = j + break + + # end bound is previous end + # or current index + if index[end[i - 1]] <= end_bound: + end[i] = i + 1 + else: + end[i] = end[i - 1] + + +def get_window_indexer(input, win, minp, index, floor=None, + use_mock=True): + """ + return the correct window indexer for the computation + + Parameters + ---------- + input: 1d ndarray + win: integer, window size + minp: integer, minimum periods + index: 1d ndarray, optional + index to the input array + floor: optional + unit for flooring the unit + use_mock: boolean, default True + if we are a fixed indexer, return a mock indexer + instead of the FixedWindow Indexer. This is a type + compat Indexer that allows us to use a standard + code path with all of the indexers. 
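At the user level, the variable indexer is what backs offset-based windows; a hypothetical sketch, assuming the offset-string rolling() API these kernels support:

    # hypothetical sketch: fixed vs. freq-aware (variable) windows
    import pandas as pd

    s = pd.Series([1., 2., 3.],
                  index=pd.to_datetime(['2016-01-01 00:00:00',
                                        '2016-01-01 00:00:01',
                                        '2016-01-01 00:00:05']))
    s.rolling(2).sum()      # fixed window: always the last two observations
    s.rolling('2s').sum()   # variable window: only observations within 2 seconds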
+ + Returns + ------- + tuple of 1d int64 ndarrays of the offsets & data about the window + + """ + + if index is not None: + indexer = VariableWindowIndexer(input, win, minp, index) + elif use_mock: + indexer = MockFixedWindowIndexer(input, win, minp, index, floor) + else: + indexer = FixedWindowIndexer(input, win, minp, index, floor) + return indexer.get_data() + +# ---------------------------------------------------------------------- +# Rolling count +# this is only an impl for index not None, IOW, freq aware + + +def roll_count(ndarray[double_t] input, int64_t win, int64_t minp, + object index): + cdef: + double val, count_x = 0.0 + int64_t s, e, nobs, N + Py_ssize_t i, j + ndarray[int64_t] start, end + ndarray[double_t] output + + start, end, N, win, minp, _ = get_window_indexer(input, win, + minp, index) + output = np.empty(N, dtype=float) + + with nogil: + + for i in range(0, N): + s = start[i] + e = end[i] + + if i == 0: + + # setup + count_x = 0.0 + for j in range(s, e): + val = input[j] + if val == val: + count_x += 1.0 + + else: + + # calculate deletes + for j in range(start[i - 1], s): + val = input[j] + if val == val: + count_x -= 1.0 + + # calculate adds + for j in range(end[i - 1], e): + val = input[j] + if val == val: + count_x += 1.0 + + if count_x >= minp: + output[i] = count_x + else: + output[i] = NaN + + return output + +# ---------------------------------------------------------------------- +# Rolling sum + + +cdef inline double calc_sum(int64_t minp, int64_t nobs, double sum_x) nogil: + cdef double result + + if nobs >= minp: + result = sum_x + else: + result = NaN + + return result + + +cdef inline void add_sum(double val, int64_t *nobs, double *sum_x) nogil: + """ add a value from the sum calc """ + + # Not NaN + if val == val: + nobs[0] = nobs[0] + 1 + sum_x[0] = sum_x[0] + val + + +cdef inline void remove_sum(double val, int64_t *nobs, double *sum_x) nogil: + """ remove a value from the sum calc """ + + if val == val: + nobs[0] = nobs[0] - 1 + sum_x[0] = sum_x[0] - val + + +def roll_sum(ndarray[double_t] input, int64_t win, int64_t minp, + object index): + cdef: + double val, prev_x, sum_x = 0 + int64_t s, e + int64_t nobs = 0, i, j, N + bint is_variable + ndarray[int64_t] start, end + ndarray[double_t] output + + start, end, N, win, minp, is_variable = get_window_indexer(input, win, + minp, index) + output = np.empty(N, dtype=float) + + # for performance we are going to iterate + # fixed windows separately, makes the code more complex as we have 2 paths + # but is faster + + if is_variable: + + # variable window + with nogil: + + for i in range(0, N): + s = start[i] + e = end[i] + + if i == 0: + + # setup + sum_x = 0.0 + nobs = 0 + for j in range(s, e): + add_sum(input[j], &nobs, &sum_x) + + else: + + # calculate deletes + for j in range(start[i - 1], s): + remove_sum(input[j], &nobs, &sum_x) + + # calculate adds + for j in range(end[i - 1], e): + add_sum(input[j], &nobs, &sum_x) + + output[i] = calc_sum(minp, nobs, sum_x) + + else: + + # fixed window + + with nogil: + + for i in range(0, minp - 1): + add_sum(input[i], &nobs, &sum_x) + output[i] = NaN + + for i in range(minp - 1, N): + val = input[i] + add_sum(val, &nobs, &sum_x) + + if i > win - 1: + prev_x = input[i - win] + remove_sum(prev_x, &nobs, &sum_x) + + output[i] = calc_sum(minp, nobs, sum_x) + + return output + +# ---------------------------------------------------------------------- +# Rolling mean + + +cdef inline double calc_mean(int64_t minp, Py_ssize_t nobs, + Py_ssize_t neg_ct, double sum_x) 
nogil: + cdef double result + + if nobs >= minp: + result = sum_x / nobs + if neg_ct == 0 and result < 0: + # all positive + result = 0 + elif neg_ct == nobs and result > 0: + # all negative + result = 0 + else: + pass + else: + result = NaN + return result + + +cdef inline void add_mean(double val, Py_ssize_t *nobs, double *sum_x, + Py_ssize_t *neg_ct) nogil: + """ add a value from the mean calc """ + + # Not NaN + if val == val: + nobs[0] = nobs[0] + 1 + sum_x[0] = sum_x[0] + val + if signbit(val): + neg_ct[0] = neg_ct[0] + 1 + + +cdef inline void remove_mean(double val, Py_ssize_t *nobs, double *sum_x, + Py_ssize_t *neg_ct) nogil: + """ remove a value from the mean calc """ + + if val == val: + nobs[0] = nobs[0] - 1 + sum_x[0] = sum_x[0] - val + if signbit(val): + neg_ct[0] = neg_ct[0] - 1 + + +def roll_mean(ndarray[double_t] input, int64_t win, int64_t minp, + object index): + cdef: + double val, prev_x, result, sum_x = 0 + int64_t s, e + bint is_variable + Py_ssize_t nobs = 0, i, j, neg_ct = 0, N + ndarray[int64_t] start, end + ndarray[double_t] output + + start, end, N, win, minp, is_variable = get_window_indexer(input, win, + minp, index) + output = np.empty(N, dtype=float) + + # for performance we are going to iterate + # fixed windows separately, makes the code more complex as we have 2 paths + # but is faster + + if is_variable: + + with nogil: + + for i in range(0, N): + s = start[i] + e = end[i] + + if i == 0: + + # setup + sum_x = 0.0 + nobs = 0 + for j in range(s, e): + val = input[j] + add_mean(val, &nobs, &sum_x, &neg_ct) + + else: + + # calculate deletes + for j in range(start[i - 1], s): + val = input[j] + remove_mean(val, &nobs, &sum_x, &neg_ct) + + # calculate adds + for j in range(end[i - 1], e): + val = input[j] + add_mean(val, &nobs, &sum_x, &neg_ct) + + output[i] = calc_mean(minp, nobs, neg_ct, sum_x) + + else: + + with nogil: + for i from 0 <= i < minp - 1: + val = input[i] + add_mean(val, &nobs, &sum_x, &neg_ct) + output[i] = NaN + + for i from minp - 1 <= i < N: + val = input[i] + add_mean(val, &nobs, &sum_x, &neg_ct) + + if i > win - 1: + prev_x = input[i - win] + remove_mean(prev_x, &nobs, &sum_x, &neg_ct) + + output[i] = calc_mean(minp, nobs, neg_ct, sum_x) + + return output + +# ---------------------------------------------------------------------- +# Rolling variance + + +cdef inline double calc_var(int64_t minp, int ddof, double nobs, + double ssqdm_x) nogil: + cdef double result + + # Variance is unchanged if no observation is added or removed + if (nobs >= minp) and (nobs > ddof): + + # pathological case + if nobs == 1: + result = 0 + else: + result = ssqdm_x / (nobs - ddof) + if result < 0: + result = 0 + else: + result = NaN + + return result + + +cdef inline void add_var(double val, double *nobs, double *mean_x, + double *ssqdm_x) nogil: + """ add a value from the var calc """ + cdef double delta + + # Not NaN + if val == val: + nobs[0] = nobs[0] + 1 + + delta = (val - mean_x[0]) + mean_x[0] = mean_x[0] + delta / nobs[0] + ssqdm_x[0] = ssqdm_x[0] + delta * (val - mean_x[0]) + + +cdef inline void remove_var(double val, double *nobs, double *mean_x, + double *ssqdm_x) nogil: + """ remove a value from the var calc """ + cdef double delta + + # Not NaN + if val == val: + nobs[0] = nobs[0] - 1 + if nobs[0]: + delta = (val - mean_x[0]) + mean_x[0] = mean_x[0] - delta / nobs[0] + ssqdm_x[0] = ssqdm_x[0] - delta * (val - mean_x[0]) + else: + mean_x[0] = 0 + ssqdm_x[0] = 0 + + +def roll_var(ndarray[double_t] input, int64_t win, int64_t minp, + object index, int 
ddof=1): + """ + Numerically stable implementation using Welford's method. + """ + cdef: + double val, prev, mean_x = 0, ssqdm_x = 0, nobs = 0, delta + int64_t s, e + bint is_variable + Py_ssize_t i, j, N + ndarray[int64_t] start, end + ndarray[double_t] output + + start, end, N, win, minp, is_variable = get_window_indexer(input, win, + minp, index) + output = np.empty(N, dtype=float) + + # Check for windows larger than array, addresses #7297 + win = min(win, N) + + # for performance we are going to iterate + # fixed windows separately, makes the code more complex as we + # have 2 paths but is faster + + if is_variable: + + with nogil: + + for i in range(0, N): + + s = start[i] + e = end[i] + + # Over the first window, observations can only be added + # never removed + if i == 0: + + for j in range(s, e): + add_var(input[j], &nobs, &mean_x, &ssqdm_x) + + else: + + # After the first window, observations can both be added + # and removed + + # calculate adds + for j in range(end[i - 1], e): + add_var(input[j], &nobs, &mean_x, &ssqdm_x) + + # calculate deletes + for j in range(start[i - 1], s): + remove_var(input[j], &nobs, &mean_x, &ssqdm_x) + + output[i] = calc_var(minp, ddof, nobs, ssqdm_x) + + else: + + with nogil: + + # Over the first window, observations can only be added, never + # removed + for i from 0 <= i < win: + add_var(input[i], &nobs, &mean_x, &ssqdm_x) + output[i] = calc_var(minp, ddof, nobs, ssqdm_x) + + # After the first window, observations can both be added and + # removed + for i from win <= i < N: + val = input[i] + prev = input[i - win] + + if val == val: + if prev == prev: + + # Adding one observation and removing another one + delta = val - prev + prev -= mean_x + mean_x += delta / nobs + val -= mean_x + ssqdm_x += (val + prev) * delta + + else: + add_var(val, &nobs, &mean_x, &ssqdm_x) + elif prev == prev: + remove_var(prev, &nobs, &mean_x, &ssqdm_x) + + output[i] = calc_var(minp, ddof, nobs, ssqdm_x) + + return output + + +# ---------------------------------------------------------------------- +# Rolling skewness + +cdef inline double calc_skew(int64_t minp, int64_t nobs, double x, double xx, + double xxx) nogil: + cdef double result, dnobs + cdef double A, B, C, R + + if nobs >= minp: + dnobs = nobs + A = x / dnobs + B = xx / dnobs - A * A + C = xxx / dnobs - A * A * A - 3 * A * B + if B <= 0 or nobs < 3: + result = NaN + else: + R = sqrt(B) + result = ((sqrt(dnobs * (dnobs - 1.)) * C) / + ((dnobs - 2) * R * R * R)) + else: + result = NaN + + return result + +cdef inline void add_skew(double val, int64_t *nobs, double *x, double *xx, + double *xxx) nogil: + """ add a value from the skew calc """ + + # Not NaN + if val == val: + nobs[0] = nobs[0] + 1 + + # seriously don't ask me why this is faster + x[0] = x[0] + val + xx[0] = xx[0] + val * val + xxx[0] = xxx[0] + val * val * val + +cdef inline void remove_skew(double val, int64_t *nobs, double *x, double *xx, + double *xxx) nogil: + """ remove a value from the skew calc """ + + # Not NaN + if val == val: + nobs[0] = nobs[0] - 1 + + # seriously don't ask me why this is faster + x[0] = x[0] - val + xx[0] = xx[0] - val * val + xxx[0] = xxx[0] - val * val * val + + +def roll_skew(ndarray[double_t] input, int64_t win, int64_t minp, + object index): + cdef: + double val, prev + double x = 0, xx = 0, xxx = 0 + int64_t nobs = 0, i, j, N + int64_t s, e + bint is_variable + ndarray[int64_t] start, end + ndarray[double_t] output + + start, end, N, win, minp, is_variable = get_window_indexer(input, win, + minp, index) + output 
= np.empty(N, dtype=float) + + if is_variable: + + with nogil: + + for i in range(0, N): + + s = start[i] + e = end[i] + + # Over the first window, observations can only be added + # never removed + if i == 0: + + for j in range(s, e): + val = input[j] + add_skew(val, &nobs, &x, &xx, &xxx) + + else: + + # After the first window, observations can both be added + # and removed + + # calculate adds + for j in range(end[i - 1], e): + val = input[j] + add_skew(val, &nobs, &x, &xx, &xxx) + + # calculate deletes + for j in range(start[i - 1], s): + val = input[j] + remove_skew(val, &nobs, &x, &xx, &xxx) + + output[i] = calc_skew(minp, nobs, x, xx, xxx) + + else: + + with nogil: + for i from 0 <= i < minp - 1: + val = input[i] + add_skew(val, &nobs, &x, &xx, &xxx) + output[i] = NaN + + for i from minp - 1 <= i < N: + val = input[i] + add_skew(val, &nobs, &x, &xx, &xxx) + + if i > win - 1: + prev = input[i - win] + remove_skew(prev, &nobs, &x, &xx, &xxx) + + output[i] = calc_skew(minp, nobs, x, xx, xxx) + + return output + +# ---------------------------------------------------------------------- +# Rolling kurtosis + + +cdef inline double calc_kurt(int64_t minp, int64_t nobs, double x, double xx, + double xxx, double xxxx) nogil: + cdef double result, dnobs + cdef double A, B, C, D, R, K + + if nobs >= minp: + dnobs = nobs + A = x / dnobs + R = A * A + B = xx / dnobs - R + R = R * A + C = xxx / dnobs - R - 3 * A * B + R = R * A + D = xxxx / dnobs - R - 6 * B * A * A - 4 * C * A + + if B == 0 or nobs < 4: + result = NaN + else: + K = (dnobs * dnobs - 1.) * D / (B * B) - 3 * ((dnobs - 1.) ** 2) + result = K / ((dnobs - 2.) * (dnobs - 3.)) + else: + result = NaN + + return result + +cdef inline void add_kurt(double val, int64_t *nobs, double *x, double *xx, + double *xxx, double *xxxx) nogil: + """ add a value from the kurotic calc """ + + # Not NaN + if val == val: + nobs[0] = nobs[0] + 1 + + # seriously don't ask me why this is faster + x[0] = x[0] + val + xx[0] = xx[0] + val * val + xxx[0] = xxx[0] + val * val * val + xxxx[0] = xxxx[0] + val * val * val * val + +cdef inline void remove_kurt(double val, int64_t *nobs, double *x, double *xx, + double *xxx, double *xxxx) nogil: + """ remove a value from the kurotic calc """ + + # Not NaN + if val == val: + nobs[0] = nobs[0] - 1 + + # seriously don't ask me why this is faster + x[0] = x[0] - val + xx[0] = xx[0] - val * val + xxx[0] = xxx[0] - val * val * val + xxxx[0] = xxxx[0] - val * val * val * val + + +def roll_kurt(ndarray[double_t] input, int64_t win, int64_t minp, + object index): + cdef: + double val, prev + double x = 0, xx = 0, xxx = 0, xxxx = 0 + int64_t nobs = 0, i, j, N + int64_t s, e + bint is_variable + ndarray[int64_t] start, end + ndarray[double_t] output + + start, end, N, win, minp, is_variable = get_window_indexer(input, win, + minp, index) + output = np.empty(N, dtype=float) + + if is_variable: + + with nogil: + + for i in range(0, N): + + s = start[i] + e = end[i] + + # Over the first window, observations can only be added + # never removed + if i == 0: + + for j in range(s, e): + add_kurt(input[j], &nobs, &x, &xx, &xxx, &xxxx) + + else: + + # After the first window, observations can both be added + # and removed + + # calculate adds + for j in range(end[i - 1], e): + add_kurt(input[j], &nobs, &x, &xx, &xxx, &xxxx) + + # calculate deletes + for j in range(start[i - 1], s): + remove_kurt(input[j], &nobs, &x, &xx, &xxx, &xxxx) + + output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx) + + else: + + with nogil: + + for i from 0 <= i < 
minp - 1: + add_kurt(input[i], &nobs, &x, &xx, &xxx, &xxxx) + output[i] = NaN + + for i from minp - 1 <= i < N: + add_kurt(input[i], &nobs, &x, &xx, &xxx, &xxxx) + + if i > win - 1: + prev = input[i - win] + remove_kurt(prev, &nobs, &x, &xx, &xxx, &xxxx) + + output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx) + + return output + +# ---------------------------------------------------------------------- +# Rolling median, min, max + + +def roll_median_c(ndarray[float64_t] input, int64_t win, int64_t minp, + object index): + cdef: + double val, res, prev + bint err=0, is_variable + int ret=0 + skiplist_t *sl + Py_ssize_t i, j + int64_t nobs = 0, N, s, e + int midpoint + ndarray[int64_t] start, end + ndarray[double_t] output + + # we use the Fixed/Variable Indexer here as the + # actual skiplist ops outweigh any window computation costs + start, end, N, win, minp, is_variable = get_window_indexer( + input, win, + minp, index, + use_mock=False) + output = np.empty(N, dtype=float) + + sl = skiplist_init(win) + if sl == NULL: + raise MemoryError("skiplist_init failed") + + with nogil: + + for i in range(0, N): + s = start[i] + e = end[i] + + if i == 0: + + # setup + val = input[i] + if val == val: + nobs += 1 + err = skiplist_insert(sl, val) != 1 + if err: + break + + else: + + # calculate deletes + for j in range(start[i - 1], s): + val = input[j] + if val == val: + skiplist_remove(sl, val) + nobs -= 1 + + # calculate adds + for j in range(end[i - 1], e): + val = input[j] + if val == val: + nobs += 1 + err = skiplist_insert(sl, val) != 1 + if err: + break + + if nobs >= minp: + midpoint = (nobs / 2) + if nobs % 2: + res = skiplist_get(sl, midpoint, &ret) + else: + res = (skiplist_get(sl, midpoint, &ret) + + skiplist_get(sl, (midpoint - 1), &ret)) / 2 + else: + res = NaN + + output[i] = res + + skiplist_destroy(sl) + if err: + raise MemoryError("skiplist_insert failed") + return output + +# ---------------------------------------------------------------------- + +# Moving maximum / minimum code taken from Bottleneck under the terms +# of its Simplified BSD license +# https://github.com/kwgoodman/bottleneck + + +cdef inline numeric init_mm(numeric ai, Py_ssize_t *nobs, bint is_max) nogil: + + if numeric in cython.floating: + if ai == ai: + nobs[0] = nobs[0] + 1 + elif is_max: + if numeric == cython.float: + ai = MINfloat32 + else: + ai = MINfloat64 + else: + if numeric == cython.float: + ai = MAXfloat32 + else: + ai = MAXfloat64 + + else: + nobs[0] = nobs[0] + 1 + + return ai + + +cdef inline void remove_mm(numeric aold, Py_ssize_t *nobs) nogil: + """ remove a value from the mm calc """ + if numeric in cython.floating and aold == aold: + nobs[0] = nobs[0] - 1 + + +cdef inline numeric calc_mm(int64_t minp, Py_ssize_t nobs, + numeric value) nogil: + cdef numeric result + + if numeric in cython.floating: + if nobs >= minp: + result = value + else: + result = NaN + else: + result = value + + return result + + +def roll_max(ndarray[numeric] input, int64_t win, int64_t minp, + object index): + """ + Moving max of 1d array of any numeric type along axis=0 ignoring NaNs. 
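The median kernel above keeps a skiplist of the current window so that insert, remove and rank selection stay O(log window); the bookkeeping can be illustrated in pure Python with a sorted list standing in for the C skiplist (naive_roll_median is an invented name, and NaN handling is omitted):

    # pure-Python illustration of the rolling-median bookkeeping (not the C skiplist)
    import bisect
    import numpy as np

    def naive_roll_median(values, win, minp):
        out = np.full(len(values), np.nan)
        window = []                              # kept sorted; stands in for skiplist_t
        for i, val in enumerate(values):
            bisect.insort(window, val)           # the "add" step
            if i >= win:
                window.remove(values[i - win])   # the "remove" step
            if len(window) >= minp:
                mid = len(window) // 2
                out[i] = (window[mid] if len(window) % 2
                          else (window[mid] + window[mid - 1]) / 2.0)
        return out

    naive_roll_median(np.array([1., 3., 2., 4.]), win=2, minp=1)
    # -> array([1. , 2. , 2.5, 3. ])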
+ + Parameters + ---------- + input: numpy array + window: int, size of rolling window + minp: if number of observations in window + is below this, output a NaN + index: ndarray, optional + index for window computation + """ + return _roll_min_max(input, win, minp, index, is_max=1) + + +def roll_min(ndarray[numeric] input, int64_t win, int64_t minp, + object index): + """ + Moving max of 1d array of any numeric type along axis=0 ignoring NaNs. + + Parameters + ---------- + input: numpy array + window: int, size of rolling window + minp: if number of observations in window + is below this, output a NaN + index: ndarray, optional + index for window computation + """ + return _roll_min_max(input, win, minp, index, is_max=0) + + +cdef _roll_min_max(ndarray[numeric] input, int64_t win, int64_t minp, + object index, bint is_max): + """ + Moving min/max of 1d array of any numeric type along axis=0 + ignoring NaNs. + """ + + cdef: + numeric ai + bint is_variable, should_replace + int64_t s, e, N, i, j, removed + Py_ssize_t nobs = 0 + ndarray[int64_t] starti, endi + ndarray[numeric, ndim=1] output + cdef: + int64_t* death + numeric* ring + numeric* minvalue + numeric* end + numeric* last + + cdef: + cdef numeric r + + starti, endi, N, win, minp, is_variable = get_window_indexer( + input, win, + minp, index) + + output = np.empty(N, dtype=input.dtype) + + if is_variable: + + with nogil: + + for i in range(N): + s = starti[i] + e = endi[i] + + r = input[s] + nobs = 0 + for j in range(s, e): + + # adds, death at the i offset + ai = init_mm(input[j], &nobs, is_max) + + if is_max: + if ai > r: + r = ai + else: + if ai < r: + r = ai + + output[i] = calc_mm(minp, nobs, r) + + else: + + # setup the rings of death! + ring = malloc(win * sizeof(numeric)) + death = malloc(win * sizeof(int64_t)) + + end = ring + win + last = ring + minvalue = ring + ai = input[0] + minvalue[0] = init_mm(input[0], &nobs, is_max) + death[0] = win + nobs = 0 + + with nogil: + + for i in range(N): + ai = init_mm(input[i], &nobs, is_max) + + if i >= win: + remove_mm(input[i - win], &nobs) + + if death[minvalue - ring] == i: + minvalue = minvalue + 1 + if minvalue >= end: + minvalue = ring + + if is_max: + should_replace = ai >= minvalue[0] + else: + should_replace = ai <= minvalue[0] + if should_replace: + + minvalue[0] = ai + death[minvalue - ring] = i + win + last = minvalue + + else: + + if is_max: + should_replace = last[0] <= ai + else: + should_replace = last[0] >= ai + while should_replace: + if last == ring: + last = end + last -= 1 + if is_max: + should_replace = last[0] <= ai + else: + should_replace = last[0] >= ai + + last += 1 + if last == end: + last = ring + last[0] = ai + death[last - ring] = i + win + + output[i] = calc_mm(minp, nobs, minvalue[0]) + + for i in range(minp - 1): + if numeric in cython.floating: + output[i] = NaN + else: + output[i] = 0 + + free(ring) + free(death) + + # print("output: {0}".format(output)) + return output + + +def roll_quantile(ndarray[float64_t, cast=True] input, int64_t win, + int64_t minp, object index, double quantile): + """ + O(N log(window)) implementation using skip list + """ + cdef: + double val, prev, midpoint + IndexableSkiplist skiplist + int64_t nobs = 0, i, j, s, e, N + Py_ssize_t idx + bint is_variable + ndarray[int64_t] start, end + ndarray[double_t] output + + # we use the Fixed/Variable Indexer here as the + # actual skiplist ops outweigh any window computation costs + start, end, N, win, minp, is_variable = get_window_indexer( + input, win, + minp, index, + 
+
+def roll_quantile(ndarray[float64_t, cast=True] input, int64_t win,
+                  int64_t minp, object index, double quantile):
+    """
+    O(N log(window)) implementation using skip list
+    """
+    cdef:
+        double val, prev, midpoint
+        IndexableSkiplist skiplist
+        int64_t nobs = 0, i, j, s, e, N
+        Py_ssize_t idx
+        bint is_variable
+        ndarray[int64_t] start, end
+        ndarray[double_t] output
+
+    # we use the Fixed/Variable Indexer here as the
+    # actual skiplist ops outweigh any window computation costs
+    start, end, N, win, minp, is_variable = get_window_indexer(
+        input, win,
+        minp, index,
+        use_mock=False)
+    output = np.empty(N, dtype=float)
+    skiplist = IndexableSkiplist(win)
+
+    for i in range(0, N):
+        s = start[i]
+        e = end[i]
+
+        if i == 0:
+
+            # setup
+            val = input[i]
+            if val == val:
+                nobs += 1
+                skiplist.insert(val)
+
+        else:
+
+            # calculate deletes
+            for j in range(start[i - 1], s):
+                val = input[j]
+                if val == val:
+                    skiplist.remove(val)
+                    nobs -= 1
+
+            # calculate adds
+            for j in range(end[i - 1], e):
+                val = input[j]
+                if val == val:
+                    nobs += 1
+                    skiplist.insert(val)
+
+        if nobs >= minp:
+            idx = int(quantile * (nobs - 1))
+            output[i] = skiplist.get(idx)
+        else:
+            output[i] = NaN
+
+    return output
+
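roll_quantile reads the order statistic at idx = int(quantile * (nobs - 1)), i.e. the lower of the two bracketing ranks, so quantile=0.5 on an odd-sized window returns the exact median. A small illustrative check (plain Python, not part of this patch):

window_values = sorted([3.0, 1.0, 4.0, 1.0, 5.0])    # nobs = 5
quantile = 0.5
idx = int(quantile * (len(window_values) - 1))       # -> 2
assert window_values[idx] == 3.0                     # the median of the window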
+
+def roll_generic(ndarray[float64_t, cast=True] input,
+                 int64_t win, int64_t minp, object index,
+                 int offset, object func,
+                 object args, object kwargs):
+    cdef:
+        ndarray[double_t] output, counts, bufarr
+        float64_t *buf
+        float64_t *oldbuf
+        int64_t nobs = 0, i, j, s, e, N
+        bint is_variable
+        ndarray[int64_t] start, end
+
+    if not input.flags.c_contiguous:
+        input = input.copy('C')
+
+    n = len(input)
+    if n == 0:
+        return input
+
+    start, end, N, win, minp, is_variable = get_window_indexer(input, win,
+                                                               minp, index,
+                                                               floor=0)
+    output = np.empty(N, dtype=float)
+
+    counts = roll_sum(np.concatenate([np.isfinite(input).astype(float),
+                                      np.array([0.] * offset)]),
+                      win, minp, index)[offset:]
+
+    if is_variable:
+
+        # variable window
+        if offset != 0:
+            raise ValueError("unable to roll_generic with a non-zero offset")
+
+        for i in range(0, N):
+            s = start[i]
+            e = end[i]
+
+            if counts[i] >= minp:
+                output[i] = func(input[s:e], *args, **kwargs)
+            else:
+                output[i] = NaN
+
+    else:
+
+        # truncated windows at the beginning, through first full-length window
+        for i from 0 <= i < (int_min(win, N) - offset):
+            if counts[i] >= minp:
+                output[i] = func(input[0: (i + offset + 1)], *args, **kwargs)
+            else:
+                output[i] = NaN
+
+        # remaining full-length windows
+        buf = <float64_t *>input.data
+        bufarr = np.empty(win, dtype=float)
+        oldbuf = <float64_t *>bufarr.data
+        for i from (win - offset) <= i < (N - offset):
+            buf = buf + 1
+            bufarr.data = <char *>buf
+            if counts[i] >= minp:
+                output[i] = func(bufarr, *args, **kwargs)
+            else:
+                output[i] = NaN
+        bufarr.data = <char *>oldbuf
+
+        # truncated windows at the end
+        for i from int_max(N - offset, 0) <= i < N:
+            if counts[i] >= minp:
+                output[i] = func(input[int_max(i + offset - win + 1, 0): N],
+                                 *args,
+                                 **kwargs)
+            else:
+                output[i] = NaN
+
+    return output
+
+
+def roll_window(ndarray[float64_t, ndim=1, cast=True] input,
+                ndarray[float64_t, ndim=1, cast=True] weights,
+                int minp, bint avg=True):
+    """
+    Assume len(weights) << len(input)
+    """
+    cdef:
+        ndarray[double_t] output, tot_wgt, counts
+        Py_ssize_t in_i, win_i, win_n, win_k, in_n, in_k
+        float64_t val_in, val_win, c, w
+
+    in_n = len(input)
+    win_n = len(weights)
+    output = np.zeros(in_n, dtype=float)
+    counts = np.zeros(in_n, dtype=float)
+    if avg:
+        tot_wgt = np.zeros(in_n, dtype=float)
+
+    minp = _check_minp(len(weights), minp, in_n)
+
+    if avg:
+        for win_i from 0 <= win_i < win_n:
+            val_win = weights[win_i]
+            if val_win != val_win:
+                continue
+
+            for in_i from 0 <= in_i < in_n - (win_n - win_i) + 1:
+                val_in = input[in_i]
+                if val_in == val_in:
+                    output[in_i + (win_n - win_i) - 1] += val_in * val_win
+                    counts[in_i + (win_n - win_i) - 1] += 1
+                    tot_wgt[in_i + (win_n - win_i) - 1] += val_win
+
+        for in_i from 0 <= in_i < in_n:
+            c = counts[in_i]
+            if c < minp:
+                output[in_i] = NaN
+            else:
+                w = tot_wgt[in_i]
+                if w == 0:
+                    output[in_i] = NaN
+                else:
+                    output[in_i] /= tot_wgt[in_i]
+
+    else:
+        for win_i from 0 <= win_i < win_n:
+            val_win = weights[win_i]
+            if val_win != val_win:
+                continue
+
+            for in_i from 0 <= in_i < in_n - (win_n - win_i) + 1:
+                val_in = input[in_i]
+
+                if val_in == val_in:
+                    output[in_i + (win_n - win_i) - 1] += val_in * val_win
+                    counts[in_i + (win_n - win_i) - 1] += 1
+
+        for in_i from 0 <= in_i < in_n:
+            c = counts[in_i]
+            if c < minp:
+                output[in_i] = NaN
+
+    return output
+
+# ----------------------------------------------------------------------
+# Exponentially weighted moving average
+
+
+def ewma(ndarray[double_t] input, double_t com, int adjust, int ignore_na,
+         int minp):
+    """
+    Compute exponentially-weighted moving average using center-of-mass.
+
+    Parameters
+    ----------
+    input : ndarray (float64 type)
+    com : float64
+    adjust: int
+    ignore_na: int
+    minp: int
+
+    Returns
+    -------
+    y : ndarray
+    """
+
+    cdef Py_ssize_t N = len(input)
+    cdef ndarray[double_t] output = np.empty(N, dtype=float)
+    if N == 0:
+        return output
+
+    minp = max(minp, 1)
+
+    cdef double alpha, old_wt_factor, new_wt, weighted_avg, old_wt, cur
+    cdef Py_ssize_t i, nobs
+
+    alpha = 1. / (1. + com)
+    old_wt_factor = 1. - alpha
+    new_wt = 1. if adjust else alpha
+
+    weighted_avg = input[0]
+    is_observation = (weighted_avg == weighted_avg)
+    nobs = int(is_observation)
+    output[0] = weighted_avg if (nobs >= minp) else NaN
+    old_wt = 1.
+
+    for i from 1 <= i < N:
+        cur = input[i]
+        is_observation = (cur == cur)
+        nobs += int(is_observation)
+        if weighted_avg == weighted_avg:
+
+            if is_observation or (not ignore_na):
+
+                old_wt *= old_wt_factor
+                if is_observation:
+
+                    # avoid numerical errors on constant series
+                    if weighted_avg != cur:
+                        weighted_avg = ((old_wt * weighted_avg) +
+                                        (new_wt * cur)) / (old_wt + new_wt)
+                    if adjust:
+                        old_wt += new_wt
+                    else:
+                        old_wt = 1.
+        elif is_observation:
+            weighted_avg = cur
+
+        output[i] = weighted_avg if (nobs >= minp) else NaN
+
+    return output
+
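With alpha = 1 / (1 + com) and adjust=1, the recurrence above is equivalent to an explicitly weighted average with weights (1 - alpha)**age over the past observations. A small NumPy sketch for NaN-free input and minp <= 1 (illustrative only, not part of this patch):

import numpy as np


def ewma_adjust_py(x, com):
    # adjust=1, ignore_na=0, NaN-free input: weights are (1 - alpha) ** age.
    alpha = 1. / (1. + com)
    out = np.empty(len(x))
    for i in range(len(x)):
        w = (1. - alpha) ** np.arange(i, -1, -1)     # oldest ... newest
        out[i] = np.sum(w * x[:i + 1]) / np.sum(w)
    return out


# ewma_adjust_py(np.array([1., 2., 3.]), com=1.)
# -> [1.0, 1.6666..., 2.4285...], matching the recurrence above with minp=1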
+# ----------------------------------------------------------------------
+# Exponentially weighted moving covariance
+
+
+def ewmcov(ndarray[double_t] input_x, ndarray[double_t] input_y,
+           double_t com, int adjust, int ignore_na, int minp, int bias):
+    """
+    Compute exponentially-weighted moving covariance using center-of-mass.
+
+    Parameters
+    ----------
+    input_x : ndarray (float64 type)
+    input_y : ndarray (float64 type)
+    com : float64
+    adjust: int
+    ignore_na: int
+    minp: int
+    bias: int
+
+    Returns
+    -------
+    y : ndarray
+    """
+
+    cdef Py_ssize_t N = len(input_x)
+    if len(input_y) != N:
+        raise ValueError("arrays are of different lengths "
+                         "(%d and %d)" % (N, len(input_y)))
+    cdef ndarray[double_t] output = np.empty(N, dtype=float)
+    if N == 0:
+        return output
+
+    minp = max(minp, 1)
+
+    cdef double alpha, old_wt_factor, new_wt, mean_x, mean_y, cov
+    cdef double sum_wt, sum_wt2, old_wt, cur_x, cur_y, old_mean_x, old_mean_y
+    cdef Py_ssize_t i, nobs
+
+    alpha = 1. / (1. + com)
+    old_wt_factor = 1. - alpha
+    new_wt = 1. if adjust else alpha
+
+    mean_x = input_x[0]
+    mean_y = input_y[0]
+    is_observation = ((mean_x == mean_x) and (mean_y == mean_y))
+    nobs = int(is_observation)
+    if not is_observation:
+        mean_x = NaN
+        mean_y = NaN
+    output[0] = (0. if bias else NaN) if (nobs >= minp) else NaN
+    cov = 0.
+    sum_wt = 1.
+    sum_wt2 = 1.
+    old_wt = 1.
+
+    for i from 1 <= i < N:
+        cur_x = input_x[i]
+        cur_y = input_y[i]
+        is_observation = ((cur_x == cur_x) and (cur_y == cur_y))
+        nobs += int(is_observation)
+        if mean_x == mean_x:
+            if is_observation or (not ignore_na):
+                sum_wt *= old_wt_factor
+                sum_wt2 *= (old_wt_factor * old_wt_factor)
+                old_wt *= old_wt_factor
+                if is_observation:
+                    old_mean_x = mean_x
+                    old_mean_y = mean_y
+
+                    # avoid numerical errors on constant series
+                    if mean_x != cur_x:
+                        mean_x = ((old_wt * old_mean_x) +
+                                  (new_wt * cur_x)) / (old_wt + new_wt)
+
+                    # avoid numerical errors on constant series
+                    if mean_y != cur_y:
+                        mean_y = ((old_wt * old_mean_y) +
+                                  (new_wt * cur_y)) / (old_wt + new_wt)
+                    cov = ((old_wt * (cov + ((old_mean_x - mean_x) *
+                                             (old_mean_y - mean_y)))) +
+                           (new_wt * ((cur_x - mean_x) *
+                                      (cur_y - mean_y)))) / (old_wt + new_wt)
+                    sum_wt += new_wt
+                    sum_wt2 += (new_wt * new_wt)
+                    old_wt += new_wt
+                    if not adjust:
+                        sum_wt /= old_wt
+                        sum_wt2 /= (old_wt * old_wt)
+                        old_wt = 1.
+        elif is_observation:
+            mean_x = cur_x
+            mean_y = cur_y
+
+        if nobs >= minp:
+            if not bias:
+                numerator = sum_wt * sum_wt
+                denominator = numerator - sum_wt2
+                if (denominator > 0.):
+                    output[i] = ((numerator / denominator) * cov)
+                else:
+                    output[i] = NaN
+            else:
+                output[i] = cov
+        else:
+            output[i] = NaN
+
+    return output
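When bias=0, the covariance is rescaled by sum_wt**2 / (sum_wt**2 - sum_wt2). A quick sanity check that this reduces to the familiar n / (n - 1) Bessel correction when all weights are equal (illustrative only, not part of this patch):

n = 5
sum_wt = sum_wt2 = float(n)                   # n equal unit weights
factor = (sum_wt * sum_wt) / (sum_wt * sum_wt - sum_wt2)
assert abs(factor - n / (n - 1.0)) < 1e-12    # 25 / 20 == 5 / 4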
diff --git a/scripts/bench_join.py b/scripts/bench_join.py
index 5223aac40d63b..1ce5c94130e85 100644
--- a/scripts/bench_join.py
+++ b/scripts/bench_join.py
@@ -12,9 +12,9 @@
 a = np.arange(n, dtype=np.int64)
 b = np.arange(n * pct_overlap, n * (1 + pct_overlap), dtype=np.int64)
 
-dr1 = DatetimeIndex('1/1/2000', periods=n, offset=datetools.Minute())
+dr1 = DatetimeIndex('1/1/2000', periods=n, offset=offsets.Minute())
 dr2 = DatetimeIndex(
-    dr1[int(pct_overlap * n)], periods=n, offset=datetools.Minute(2))
+    dr1[int(pct_overlap * n)], periods=n, offset=offsets.Minute(2))
 
 aobj = a.astype(object)
 bobj = b.astype(object)
diff --git a/scripts/groupby_speed.py b/scripts/groupby_speed.py
index 34f293d5008c6..3be9fac12418e 100644
--- a/scripts/groupby_speed.py
+++ b/scripts/groupby_speed.py
@@ -1,12 +1,12 @@
 from __future__ import print_function
 from pandas import *
 
-rng = DatetimeIndex('1/3/2011', '11/30/2011', offset=datetools.Minute())
+rng = DatetimeIndex('1/3/2011', '11/30/2011', offset=offsets.Minute())
 
 df = DataFrame(np.random.randn(len(rng), 5), index=rng,
                columns=list('OHLCV'))
 
-rng5 = DatetimeIndex('1/3/2011', '11/30/2011', offset=datetools.Minute(5))
+rng5 = DatetimeIndex('1/3/2011', '11/30/2011', offset=offsets.Minute(5))
 gp = rng5.asof
 grouped = df.groupby(gp)
 
diff --git a/scripts/hdfstore_panel_perf.py b/scripts/hdfstore_panel_perf.py
index 66b0b52444bc1..c66e9506fc4c5 100644
--- a/scripts/hdfstore_panel_perf.py
+++ b/scripts/hdfstore_panel_perf.py
@@ -7,7 +7,7 @@
 panel = Panel(np.random.randn(i, j, k),
               items=[rands(10) for _ in range(i)],
               major_axis=DatetimeIndex('1/1/2000', periods=j,
-                                       offset=datetools.Minute()),
+                                       offset=offsets.Minute()),
               minor_axis=[rands(10) for _ in range(k)])
 
diff --git a/scripts/merge-py.py b/scripts/merge-py.py
index 9d611213ba517..65cc9d5a2bffe 100755
--- a/scripts/merge-py.py
+++ b/scripts/merge-py.py
@@ -34,6 +34,8 @@
 import sys
 import textwrap
 
+from six.moves import input
+
 PANDAS_HOME = '.'
 PROJECT_NAME = 'pandas'
 print("PANDAS_HOME = " + PANDAS_HOME)
@@ -96,11 +98,14 @@ def run_cmd(cmd):
         if cmd is None:
             cmd = popenargs[0]
         raise subprocess.CalledProcessError(retcode, cmd, output=output)
+
+    if isinstance(output, six.binary_type):
+        output = output.decode('utf-8')
     return output
 
 
 def continue_maybe(prompt):
-    result = raw_input("\n%s (y/n): " % prompt)
+    result = input("\n%s (y/n): " % prompt)
     if result.lower() != "y":
         fail("Okay, exiting")
 
@@ -114,13 +119,14 @@ def clean_up():
     branches = run_cmd("git branch").replace(" ", "").split("\n")
 
-    for branch in filter(lambda x: x.startswith(BRANCH_PREFIX), branches):
+    for branch in [b for b in branches if b.startswith(BRANCH_PREFIX)]:
         print("Deleting local branch %s" % branch)
         run_cmd("git branch -D %s" % branch)
 
 
 # merge the requested PR and return the merge hash
 def merge_pr(pr_num, target_ref):
+
     pr_branch_name = "%s_MERGE_PR_%s" % (BRANCH_PREFIX, pr_num)
 
     target_branch_name = "%s_MERGE_PR_%s_%s" % (BRANCH_PREFIX, pr_num,
                                                 target_ref.upper())
@@ -199,7 +205,7 @@ def merge_pr(pr_num, target_ref):
 
 
 def cherry_pick(pr_num, merge_hash, default_branch):
-    pick_ref = raw_input("Enter a branch name [%s]: " % default_branch)
+    pick_ref = input("Enter a branch name [%s]: " % default_branch)
     if pick_ref == "":
         pick_ref = default_branch
 
@@ -245,7 +251,7 @@ def fix_version_from_branch(branch, versions):
 # Assumes branch names can be sorted lexicographically
 # latest_branch = sorted(branch_names, reverse=True)[0]
 
-pr_num = raw_input("Which pull request would you like to merge? (e.g. 34): ")
+pr_num = input("Which pull request would you like to merge? (e.g. 34): ")
 pr = get_json("%s/pulls/%s" % (GITHUB_API_BASE, pr_num))
 url = pr["url"]
 
diff --git a/setup.py b/setup.py
index 596fe62ff0781..d1b97e1e3d82c 100755
--- a/setup.py
+++ b/setup.py
@@ -90,11 +90,58 @@ def is_platform_mac():
 except ImportError:
     cython = False
 
+
+if cython:
+    try:
+        try:
+            from Cython import Tempita as tempita
+        except ImportError:
+            import tempita
+    except ImportError:
+        raise ImportError('Building pandas requires Tempita: '
+                          'pip install Tempita')
+
+
 from os.path import join as pjoin
+_pxipath = pjoin('pandas', 'src')
+_pxi_dep_template = {
+    'algos': ['algos_common_helper.pxi.in', 'algos_groupby_helper.pxi.in',
+              'algos_take_helper.pxi.in'],
+    '_join': ['join_helper.pxi.in', 'joins_func_helper.pxi.in'],
+    'hashtable': ['hashtable_class_helper.pxi.in',
+                  'hashtable_func_helper.pxi.in'],
+    '_sparse': ['sparse_op_helper.pxi.in']
+}
+_pxifiles = []
+_pxi_dep = {}
+for module, files in _pxi_dep_template.items():
+    pxi_files = [pjoin(_pxipath, x) for x in files]
+    _pxifiles.extend(pxi_files)
+    _pxi_dep[module] = pxi_files
+
+
 class build_ext(_build_ext):
     def build_extensions(self):
+
+        for pxifile in _pxifiles:
+            # build pxifiles first, template extension must be .pxi.in
+            assert pxifile.endswith('.pxi.in')
+            outfile = pxifile[:-3]
+
+            if (os.path.exists(outfile) and
+                    os.stat(pxifile).st_mtime < os.stat(outfile).st_mtime):
+                # if .pxi.in is not updated, no need to output .pxi
+                continue
+
+            with open(pxifile, "r") as f:
+                tmpl = f.read()
+            pyxcontent = tempita.sub(tmpl)
+
+            with open(outfile, "w") as f:
+                f.write(pyxcontent)
+
         numpy_incl = pkg_resources.resource_filename('numpy', 'core/include')
 
         for ext in self.extensions:
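The new build_ext step renders each .pxi.in template through Tempita before Cython ever sees it. A minimal sketch of that substitution, assuming Tempita is installed; the template text below is made up for illustration and is not one of the real pandas/src templates:

try:
    from Cython import Tempita as tempita
except ImportError:
    import tempita

tmpl = """\
{{for dtype in dtypes}}
cdef inline bint is_nan_{{dtype}}({{dtype}}_t val):
    return val != val
{{endfor}}
"""
# Expands the loop once per dtype, producing plain .pxi content.
print(tempita.sub(tmpl, dtypes=['float64', 'int64']))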
@@ -270,6 +317,8 @@ class CheckSDist(sdist_class):
                  'pandas/tslib.pyx',
                  'pandas/index.pyx',
                  'pandas/algos.pyx',
+                 'pandas/join.pyx',
+                 'pandas/window.pyx',
                  'pandas/parser.pyx',
                  'pandas/src/period.pyx',
                  'pandas/src/sparse.pyx',
@@ -410,7 +459,8 @@ def pxd(name):
         'depends': lib_depends},
     hashtable={'pyxfile': 'hashtable',
               'pxdfiles': ['hashtable'],
-              'depends': ['pandas/src/klib/khash_python.h']},
+              'depends': (['pandas/src/klib/khash_python.h'] +
+                          _pxi_dep['hashtable'])},
     tslib={'pyxfile': 'tslib',
            'depends': tseries_depends,
            'sources': ['pandas/src/datetime/np_datetime.c',
@@ -425,17 +475,26 @@ def pxd(name):
            'sources': ['pandas/src/datetime/np_datetime.c',
                        'pandas/src/datetime/np_datetime_strings.c']},
     algos={'pyxfile': 'algos',
-           'pxdfiles': ['src/skiplist'],
-           'depends': [srcpath('generated', suffix='.pyx'),
-                       srcpath('join', suffix='.pyx'),
-                       'pandas/src/skiplist.pyx',
-                       'pandas/src/skiplist.h']},
+           'pxdfiles': ['src/util'],
+           'depends': _pxi_dep['algos']},
+    _join={'pyxfile': 'src/join',
+           'pxdfiles': ['src/util'],
+           'depends': _pxi_dep['_join']},
+    _window={'pyxfile': 'window',
+             'pxdfiles': ['src/skiplist', 'src/util'],
+             'depends': ['pandas/src/skiplist.pyx',
+                         'pandas/src/skiplist.h']},
     parser={'pyxfile': 'parser',
             'depends': ['pandas/src/parser/tokenizer.h',
                         'pandas/src/parser/io.h',
                         'pandas/src/numpy_helper.h'],
             'sources': ['pandas/src/parser/tokenizer.c',
                         'pandas/src/parser/io.c']},
+    _sparse={'pyxfile': 'src/sparse',
+             'depends': ([srcpath('sparse', suffix='.pyx')] +
+                         _pxi_dep['_sparse'])},
+    _testing={'pyxfile': 'src/testing',
+              'depends': [srcpath('testing', suffix='.pyx')]},
 )
 
 ext_data["io.sas.saslib"] = {'pyxfile': 'io/sas/saslib'}
@@ -461,22 +520,6 @@ def pxd(name):
 
     extensions.append(obj)
 
-sparse_ext = Extension('pandas._sparse',
-                       sources=[srcpath('sparse', suffix=suffix)],
-                       include_dirs=[],
-                       libraries=libraries,
-                       extra_compile_args=extra_compile_args)
-
-extensions.extend([sparse_ext])
-
-testing_ext = Extension('pandas._testing',
-                        sources=[srcpath('testing', suffix=suffix)],
-                        include_dirs=[],
-                        libraries=libraries,
-                        extra_compile_args=extra_compile_args)
-
-extensions.extend([testing_ext])
-
 #----------------------------------------------------------------------
 # msgpack
 
@@ -556,6 +599,9 @@ def pxd(name):
       maintainer=AUTHOR,
       version=versioneer.get_version(),
       packages=['pandas',
+                'pandas.api',
+                'pandas.api.tests',
+                'pandas.api.types',
                 'pandas.compat',
                 'pandas.compat.numpy',
                 'pandas.computation',
@@ -566,7 +612,6 @@ def pxd(name):
                 'pandas.io.sas',
                 'pandas.formats',
                 'pandas.rpy',
-                'pandas.sandbox',
                 'pandas.sparse',
                 'pandas.sparse.tests',
                 'pandas.stats',
@@ -578,6 +623,7 @@ def pxd(name):
                 'pandas.tests.formats',
                 'pandas.tests.types',
                 'pandas.tests.test_msgpack',
+                'pandas.tests.plotting',
                 'pandas.tools',
                 'pandas.tools.tests',
                 'pandas.tseries',
@@ -595,6 +641,7 @@ def pxd(name):
                                  'tests/data/legacy_msgpack/*/*.msgpack',
                                  'tests/data/*.csv*',
                                  'tests/data/*.dta',
+                                 'tests/data/*.pickle',
                                  'tests/data/*.txt',
                                  'tests/data/*.xls',
                                  'tests/data/*.xlsx',
@@ -610,9 +657,8 @@ def pxd(name):
                                  'tests/data/*.html',
                                  'tests/data/html_encoding/*.html',
                                  'tests/json/data/*.json'],
-                    'pandas.tools': ['tests/*.csv'],
-                    'pandas.tests': ['data/*.pickle',
-                                     'data/*.csv'],
+                    'pandas.tools': ['tests/data/*.csv'],
+                    'pandas.tests': ['data/*.csv'],
                     'pandas.tests.formats': ['data/*.csv'],
                     'pandas.tests.indexes': ['data/*.pickle'],
                     'pandas.tseries.tests': ['data/*.pickle',