diff --git a/.travis.yml b/.travis.yml index fe1a2950dbf08..42b4ef0396fc8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -102,8 +102,6 @@ before_install: - uname -a - git --version - git tag - - ci/before_install_travis.sh - - export DISPLAY=":99.0" install: - echo "install start" @@ -114,6 +112,8 @@ install: before_script: - ci/install_db_travis.sh + - export DISPLAY=":99.0" + - ci/before_script_travis.sh script: - echo "script start" diff --git a/appveyor.yml b/appveyor.yml index a1f8886f6d068..44af73b498aa8 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -22,7 +22,7 @@ environment: PYTHON_VERSION: "3.6" PYTHON_ARCH: "64" CONDA_PY: "36" - CONDA_NPY: "112" + CONDA_NPY: "113" - CONDA_ROOT: "C:\\Miniconda3_64" PYTHON_VERSION: "2.7" diff --git a/asv_bench/benchmarks/plotting.py b/asv_bench/benchmarks/plotting.py index dda684b35e301..16889b2f19e89 100644 --- a/asv_bench/benchmarks/plotting.py +++ b/asv_bench/benchmarks/plotting.py @@ -10,15 +10,37 @@ def date_range(start=None, end=None, periods=None, freq=None): from pandas.tools.plotting import andrews_curves +class Plotting(object): + goal_time = 0.2 + + def setup(self): + import matplotlib + matplotlib.use('Agg') + self.s = Series(np.random.randn(1000000)) + self.df = DataFrame({'col': self.s}) + + def time_series_plot(self): + self.s.plot() + + def time_frame_plot(self): + self.df.plot() + + class TimeseriesPlotting(object): goal_time = 0.2 def setup(self): import matplotlib matplotlib.use('Agg') - self.N = 2000 - self.M = 5 - self.df = DataFrame(np.random.randn(self.N, self.M), index=date_range('1/1/1975', periods=self.N)) + N = 2000 + M = 5 + idx = date_range('1/1/1975', periods=N) + self.df = DataFrame(np.random.randn(N, M), index=idx) + + idx_irregular = pd.DatetimeIndex(np.concatenate((idx.values[0:10], + idx.values[12:]))) + self.df2 = DataFrame(np.random.randn(len(idx_irregular), M), + index=idx_irregular) def time_plot_regular(self): self.df.plot() @@ -26,6 +48,9 @@ def time_plot_regular(self): def time_plot_regular_compat(self): self.df.plot(x_compat=True) + def time_plot_irregular(self): + self.df2.plot() + class Misc(object): goal_time = 0.6 diff --git a/ci/before_install_travis.sh b/ci/before_script_travis.sh similarity index 93% rename from ci/before_install_travis.sh rename to ci/before_script_travis.sh index 2d0b4da6120dc..0b3939b1906a2 100755 --- a/ci/before_install_travis.sh +++ b/ci/before_script_travis.sh @@ -4,6 +4,7 @@ echo "inside $0" if [ "${TRAVIS_OS_NAME}" == "linux" ]; then sh -e /etc/init.d/xvfb start + sleep 3 fi # Never fail because bad things happened here. 
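The ``Plotting`` class added above benchmarks bare ``Series.plot`` and ``DataFrame.plot`` calls, and ``TimeseriesPlotting`` gains a ``time_plot_irregular`` case driven by a ``DatetimeIndex`` with a gap. As a rough sketch of how these asv benchmark classes can be exercised by hand (the ``benchmarks.plotting`` import path and the use of ``timeit`` are assumptions of this illustration, not part of the patch; asv itself discovers and times the ``time_*`` methods):

.. code-block:: python

    # Sketch only: drive the new benchmark classes directly, roughly the way
    # the asv runner would. Assumes pandas and matplotlib are installed and
    # that this is run from asv_bench/ so that benchmarks.plotting is importable.
    import timeit

    from benchmarks.plotting import Plotting, TimeseriesPlotting

    for cls in (Plotting, TimeseriesPlotting):
        bench = cls()
        bench.setup()  # builds the Series/DataFrame fixtures on the Agg backend
        for name in dir(bench):
            if name.startswith('time_'):  # asv treats time_* methods as benchmarks
                seconds = timeit.timeit(getattr(bench, name), number=1)
                print('{}.{}: {:.3f}s'.format(cls.__name__, name, seconds))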
diff --git a/ci/check_imports.py b/ci/check_imports.py index a83436e7d258c..d6f24ebcc4d3e 100644 --- a/ci/check_imports.py +++ b/ci/check_imports.py @@ -9,7 +9,6 @@ 'ipython', 'jinja2' 'lxml', - 'matplotlib', 'numexpr', 'openpyxl', 'py', diff --git a/ci/environment-dev.yaml b/ci/environment-dev.yaml new file mode 100644 index 0000000000000..c3d3d59f895c6 --- /dev/null +++ b/ci/environment-dev.yaml @@ -0,0 +1,14 @@ +name: pandas-dev +channels: + - defaults + - conda-forge +dependencies: + - Cython + - NumPy + - moto + - pytest + - python-dateutil + - python=3 + - pytz + - setuptools + - sphinx diff --git a/ci/install_travis.sh b/ci/install_travis.sh index b85263daa1eac..dac3625cba4ba 100755 --- a/ci/install_travis.sh +++ b/ci/install_travis.sh @@ -34,9 +34,9 @@ fi # install miniconda if [ "${TRAVIS_OS_NAME}" == "osx" ]; then - time wget http://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh || exit 1 + time wget http://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -q -O miniconda.sh || exit 1 else - time wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh || exit 1 + time wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -q -O miniconda.sh || exit 1 fi time bash miniconda.sh -b -p "$MINICONDA_DIR" || exit 1 @@ -107,7 +107,7 @@ time conda install -n pandas pytest>=3.1.0 time pip install pytest-xdist moto if [ "$LINT" ]; then - conda install flake8 + conda install flake8=3.4.1 pip install cpplint fi diff --git a/ci/requirements-2.7_BUILD_TEST.pip b/ci/requirements-2.7_BUILD_TEST.pip index a0fc77c40bc00..f4617133cad5b 100644 --- a/ci/requirements-2.7_BUILD_TEST.pip +++ b/ci/requirements-2.7_BUILD_TEST.pip @@ -1,7 +1,6 @@ xarray geopandas seaborn -pandas_gbq pandas_datareader statsmodels scikit-learn diff --git a/ci/requirements-3.5.pip b/ci/requirements-3.5.pip index 6e4f7b65f9728..c9565f2173070 100644 --- a/ci/requirements-3.5.pip +++ b/ci/requirements-3.5.pip @@ -1,2 +1,2 @@ xarray==0.9.1 -pandas-gbq +pandas_gbq diff --git a/ci/requirements-3.6.sh b/ci/requirements-3.6.sh new file mode 100644 index 0000000000000..f5c3dbf59a29d --- /dev/null +++ b/ci/requirements-3.6.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +source activate pandas + +echo "[install 3.6 downstream deps]" + +conda install -n pandas -c conda-forge pandas-datareader xarray geopandas seaborn statsmodels scikit-learn dask diff --git a/ci/requirements-3.6_WIN.run b/ci/requirements-3.6_WIN.run index af7a90b126f22..db2d429a2a4ff 100644 --- a/ci/requirements-3.6_WIN.run +++ b/ci/requirements-3.6_WIN.run @@ -1,6 +1,6 @@ python-dateutil pytz -numpy=1.12* +numpy=1.13* bottleneck openpyxl xlsxwriter diff --git a/ci/requirements_all.txt b/ci/requirements-optional-conda.txt similarity index 68% rename from ci/requirements_all.txt rename to ci/requirements-optional-conda.txt index e13afd619f105..6edb8d17337e4 100644 --- a/ci/requirements_all.txt +++ b/ci/requirements-optional-conda.txt @@ -1,28 +1,27 @@ -pytest>=3.1.0 -pytest-cov -pytest-xdist -flake8 -sphinx=1.5* -nbsphinx -ipython -python-dateutil -pytz -openpyxl -xlsxwriter -xlrd -xlwt -html5lib -patsy beautifulsoup4 -numpy -cython -scipy +blosc +bottleneck +fastparquet +feather-format +html5lib +ipython +ipykernel +jinja2 +lxml +matplotlib +nbsphinx numexpr +openpyxl +pyarrow +pymysql pytables -matplotlib +pytest-cov +pytest-xdist +s3fs +scipy seaborn -lxml sqlalchemy -bottleneck -pymysql -Jinja2 +xarray +xlrd +xlsxwriter +xlwt diff --git a/ci/requirements-optional-pip.txt 
b/ci/requirements-optional-pip.txt new file mode 100644 index 0000000000000..06b22bd8f2c63 --- /dev/null +++ b/ci/requirements-optional-pip.txt @@ -0,0 +1,27 @@ +# This file was autogenerated by scripts/convert_deps.py +# Do not modify directlybeautifulsoup4 +blosc +bottleneck +fastparquet +feather-format +html5lib +ipython +jinja2 +lxml +matplotlib +nbsphinx +numexpr +openpyxl +pyarrow +pymysql +tables +pytest-cov +pytest-xdist +s3fs +scipy +seaborn +sqlalchemy +xarray +xlrd +xlsxwriter +xlwt \ No newline at end of file diff --git a/ci/requirements_dev.txt b/ci/requirements_dev.txt index dbc4f6cbd6509..2fb36b7cd70d8 100644 --- a/ci/requirements_dev.txt +++ b/ci/requirements_dev.txt @@ -1,8 +1,10 @@ +# This file was autogenerated by scripts/convert_deps.py +# Do not modify directly +Cython +NumPy +moto +pytest python-dateutil pytz -numpy -cython -pytest>=3.1.0 -pytest-cov -flake8 -moto +setuptools +sphinx \ No newline at end of file diff --git a/ci/script_multi.sh b/ci/script_multi.sh index ee9fbcaad5ef5..ae8f030b92d66 100755 --- a/ci/script_multi.sh +++ b/ci/script_multi.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash -e echo "[script multi]" diff --git a/doc/source/api.rst b/doc/source/api.rst index 80f8d42be8ed6..a9766b5c04496 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -1794,6 +1794,7 @@ Methods Timestamp.strftime Timestamp.strptime Timestamp.time + Timestamp.timestamp Timestamp.timetuple Timestamp.timetz Timestamp.to_datetime64 @@ -2173,6 +2174,17 @@ Style Export and Import Styler.export Styler.use +Plotting +~~~~~~~~ + +.. currentmodule:: pandas + +.. autosummary:: + :toctree: generated/ + + plotting.register_matplotlib_converters + plotting.deregister_matplotlib_converters + .. currentmodule:: pandas General utility functions diff --git a/doc/source/computation.rst b/doc/source/computation.rst index 466ac3c9cbf51..0cdfec63fd696 100644 --- a/doc/source/computation.rst +++ b/doc/source/computation.rst @@ -346,7 +346,9 @@ The following methods are available: :meth:`~Window.sum`, Sum of values :meth:`~Window.mean`, Mean of values -The weights used in the window are specified by the ``win_type`` keyword. The list of recognized types are: +The weights used in the window are specified by the ``win_type`` keyword. +The list of recognized types are the `scipy.signal window functions + `__: - ``boxcar`` - ``triang`` diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst index d8d57a8bfffdd..4426d3fb0165e 100644 --- a/doc/source/contributing.rst +++ b/doc/source/contributing.rst @@ -11,32 +11,32 @@ Where to start? =============== All contributions, bug reports, bug fixes, documentation improvements, -enhancements and ideas are welcome. +enhancements, and ideas are welcome. -If you are simply looking to start working with the *pandas* codebase, navigate to the -`GitHub "issues" tab `_ and start looking through -interesting issues. There are a number of issues listed under `Docs +If you are brand new to pandas or open-source development, we recommend going +through the `GitHub "issues" tab `_ +to find issues that interest you. There are a number of issues listed under `Docs `_ and `Difficulty Novice `_ -where you could start out. - -Or maybe through using *pandas* you have an idea of your own or are looking for something -in the documentation and thinking 'this can be improved'...you can do something -about it! +where you could start out. Once you've found an interesting issue, you can +return here to get your development environment setup. 
Feel free to ask questions on the `mailing list -`_ or on `Gitter -`_. +`_ or on `Gitter`_. + +.. _contributing.bug_reports:
Bug reports and enhancement requests ====================================
-Bug reports are an important part of making *pandas* more stable. Having a complete bug report -will allow others to reproduce the bug and provide insight into fixing. Because many versions of -*pandas* are supported, knowing version information will also identify improvements made since -previous versions. Trying the bug-producing code out on the *master* branch is often a worthwhile exercise -to confirm the bug still exists. It is also worth searching existing bug reports and pull requests +Bug reports are an important part of making *pandas* more stable. Having a complete bug report +will allow others to reproduce the bug and provide insight into fixing. See +`this stackoverflow article `_ for tips on +writing a good bug report. + +Trying the bug-producing code out on the *master* branch is often a worthwhile exercise +to confirm the bug still exists. It is also worth searching existing bug reports and pull requests to see if the issue has already been reported and/or fixed.
Bug reports must: @@ -60,12 +60,16 @@ Bug reports must: The issue will then show up to the *pandas* community and be open to comments/ideas from others.
+.. _contributing.github: +
Working with the code ===================== Now that you have an issue you want to fix, enhancement to add, or documentation to improve, you need to learn how to work with GitHub and the *pandas* code base.
+.. _contributing.version_control: +
Version control, Git, and GitHub -------------------------------- @@ -103,167 +107,164 @@ want to clone your fork to your machine:: git clone https://github.com/your-user-name/pandas.git pandas-yourname cd pandas-yourname - git remote add upstream git://github.com/pandas-dev/pandas.git + git remote add upstream https://github.com/pandas-dev/pandas.git
This creates the directory `pandas-yourname` and connects your repository to the upstream (main project) *pandas* repository.
-Creating a branch ------------------ +.. _contributing.dev_env:
-You want your master branch to reflect only production-ready code, so create a -feature branch for making your changes. For example:: +Creating a development environment +----------------------------------
- git branch shiny-new-feature - git checkout shiny-new-feature +To test out code changes, you'll need to build pandas from source, which +requires a C compiler and Python environment. If you're making documentation +changes, you can skip to :ref:`contributing.documentation`, but you won't be able +to build the documentation locally before pushing your changes.
-The above can be simplified to:: +.. _contributing.dev_c:
- git checkout -b shiny-new-feature +Installing a C Compiler +~~~~~~~~~~~~~~~~~~~~~~~
-This changes your working directory to the shiny-new-feature branch. Keep any -changes in this branch specific to one bug or feature so it is clear -what the branch brings to *pandas*. You can have many shiny-new-features -and switch in between them using the git checkout command. +Pandas uses C extensions (mostly written using Cython) to speed up certain +operations. To install pandas from source, you need to compile these C +extensions, which means you need a C compiler. This process depends on which +platform you're using. Follow the `CPython contributing guidelines +`_ for getting a +compiler installed.
You don't need to do any of the ``./configure`` or ``make`` +steps; you only need to install the compiler.
-To update this branch, you need to retrieve the changes from the master branch:: +For Windows developers, the following links may be helpful.
- git fetch upstream - git rebase upstream/master +- https://blogs.msdn.microsoft.com/pythonengineering/2016/04/11/unable-to-find-vcvarsall-bat/ +- https://github.com/conda/conda-recipes/wiki/Building-from-Source-on-Windows-32-bit-and-64-bit +- https://cowboyprogrammer.org/building-python-wheels-for-windows/ +- https://blog.ionelmc.ro/2014/12/21/compiling-python-extensions-on-windows/ +- https://support.enthought.com/hc/en-us/articles/204469260-Building-Python-extensions-with-Canopy
-This will replay your commits on top of the latest pandas git master. If this -leads to merge conflicts, you must resolve these before submitting your pull -request. If you have uncommitted changes, you will need to ``stash`` them prior -to updating. This will effectively store your changes and they can be reapplied -after updating. +Let us know if you have any difficulties by opening an issue or reaching out on +`Gitter`_.
-.. _contributing.dev_env: +.. _contributing.dev_python:
-Creating a development environment ---------------------------------- +Creating a Python Environment +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-An easy way to create a *pandas* development environment is as follows. +Now that you have a C compiler, create an isolated pandas development +environment:
-- Install either :ref:`Anaconda ` or :ref:`miniconda ` +- Install either `Anaconda `_ or `miniconda + `_ +- Make sure your conda is up to date (``conda update conda``) - Make sure that you have :ref:`cloned the repository ` - ``cd`` to the *pandas* source directory
-Tell conda to create a new environment, named ``pandas_dev``, or any other name you would like -for this environment, by running:: - - conda create -n pandas_dev --file ci/requirements_dev.txt - - -For a python 3 environment:: - - conda create -n pandas_dev python=3 --file ci/requirements_dev.txt - -.. warning:: - - If you are on Windows, see :ref:`here for a fully compliant Windows environment `. - -This will create the new environment, and not touch any of your existing environments, -nor any existing python installation. It will install all of the basic dependencies of -*pandas*, as well as the development and testing tools. If you would like to install -other dependencies, you can install them as follows:: +We'll now kick off a three-step process:
- conda install -n pandas_dev -c pandas pytables scipy +1. Install the build dependencies +2. Build and install pandas +3. Install the optional dependencies
-To install *all* pandas dependencies you can do the following:: +.. code-block:: none
- conda install -n pandas_dev -c conda-forge --file ci/requirements_all.txt + # Create and activate the build environment + conda env create -f ci/environment-dev.yaml + conda activate pandas-dev
-To work in this environment, Windows users should ``activate`` it as follows:: + # Build and install pandas + python setup.py build_ext --inplace -j 4 + python -m pip install -e .
- activate pandas_dev + # Install the rest of the optional dependencies + conda install -c defaults -c conda-forge --file=ci/requirements-optional-conda.txt -Mac OSX / Linux users should use:: +At this point you should be able to import pandas from your locally built version:: - source activate pandas_dev + $ python # start an interpreter + >>> import pandas + >>> print(pandas.__version__) + 0.22.0.dev0+29.g4ad6d4d74 -You will then see a confirmation message to indicate you are in the new development environment. +This will create the new environment, and not touch any of your existing environments, +nor any existing python installation. To view your environments:: conda info -e -To return to your home root environment in Windows:: - - deactivate +To return to your root environment:: -To return to your home root environment in OSX / Linux:: - - source deactivate + conda deactivate See the full conda docs `here `__. -At this point you can easily do an *in-place* install, as detailed in the next section. - -.. _contributing.windows: - -Creating a Windows development environment ------------------------------------------- +.. _contributing.pip: -To build on Windows, you need to have compilers installed to build the extensions. You will need to install the appropriate Visual Studio compilers, VS 2008 for Python 2.7, VS 2010 for 3.4, and VS 2015 for Python 3.5 and 3.6. +Creating a Python Environment (pip) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -For Python 2.7, you can install the ``mingw`` compiler which will work equivalently to VS 2008:: +If you aren't using conda for you development environment, follow these instructions. +You'll need to have at least python3.5 installed on your system. - conda install -n pandas_dev libpython +.. code-block:: none -or use the `Microsoft Visual Studio VC++ compiler for Python `__. Note that you have to check the ``x64`` box to install the ``x64`` extension building capability as this is not installed by default. + # Create a virtual environment + # Use an ENV_DIR of your choice. We'll use ~/virtualenvs/pandas-dev + # Any parent directories should already exist + python3 -m venv ~/virtualenvs/pandas-dev + # Activate the virtulaenv + . ~/virtualenvs/pandas-dev/bin/activate -For Python 3.4, you can download and install the `Windows 7.1 SDK `__. Read the references below as there may be various gotchas during the installation. - -For Python 3.5 and 3.6, you can download and install the `Visual Studio 2015 Community Edition `__. - -Here are some references and blogs: - -- https://blogs.msdn.microsoft.com/pythonengineering/2016/04/11/unable-to-find-vcvarsall-bat/ -- https://github.com/conda/conda-recipes/wiki/Building-from-Source-on-Windows-32-bit-and-64-bit -- https://cowboyprogrammer.org/building-python-wheels-for-windows/ -- https://blog.ionelmc.ro/2014/12/21/compiling-python-extensions-on-windows/ -- https://support.enthought.com/hc/en-us/articles/204469260-Building-Python-extensions-with-Canopy + # Install the build dependencies + python -m pip install -r ci/requirements_dev.txt + # Build and install pandas + python setup.py build_ext --inplace -j 4 + python -m pip install -e . -.. _contributing.getting_source: + # Install additional dependencies + python -m pip install -r ci/requirements-optional-pip.txt -Making changes --------------- +Creating a branch +----------------- -Before making your code changes, it is often necessary to build the code that was -just checked out. There are two primary methods of doing this. 
+You want your master branch to reflect only production-ready code, so create a +feature branch for making your changes. For example:: -#. The best way to develop *pandas* is to build the C extensions in-place by - running:: + git branch shiny-new-feature + git checkout shiny-new-feature - python setup.py build_ext --inplace +The above can be simplified to:: - If you startup the Python interpreter in the *pandas* source directory you - will call the built C extensions + git checkout -b shiny-new-feature -#. Another very common option is to do a ``develop`` install of *pandas*:: +This changes your working directory to the shiny-new-feature branch. Keep any +changes in this branch specific to one bug or feature so it is clear +what the branch brings to *pandas*. You can have many shiny-new-features +and switch in between them using the git checkout command. - python setup.py develop +To update this branch, you need to retrieve the changes from the master branch:: - This makes a symbolic link that tells the Python interpreter to import *pandas* - from your development directory. Thus, you can always be using the development - version on your system without being inside the clone directory. + git fetch upstream + git rebase upstream/master +This will replay your commits on top of the latest pandas git master. If this +leads to merge conflicts, you must resolve these before submitting your pull +request. If you have uncommitted changes, you will need to ``stash`` them prior +to updating. This will effectively store your changes and they can be reapplied +after updating. .. _contributing.documentation: Contributing to the documentation ================================= -If you're not the developer type, contributing to the documentation is still -of huge value. You don't even have to be an expert on -*pandas* to do so! Something as simple as rewriting small passages for clarity -as you reference the docs is a simple but effective way to contribute. The -next person to read that passage will be in your debt! - -In fact, there are sections of the docs that are worse off after being written -by experts. If something in the docs doesn't make sense to you, updating the -relevant section after you figure it out is a simple way to ensure it will -help the next person. +If you're not the developer type, contributing to the documentation is still of +huge value. You don't even have to be an expert on *pandas* to do so! In fact, +there are sections of the docs that are worse off after being written by +experts. If something in the docs doesn't make sense to you, updating the +relevant section after you figure it out is a great way to ensure it will help +the next person. .. contents:: Documentation: :local: @@ -330,7 +331,7 @@ The utility script ``scripts/api_rst_coverage.py`` can be used to compare the list of methods documented in ``doc/source/api.rst`` (which is used to generate the `API Reference `_ page) and the actual public methods. -This will identify methods documented in in ``doc/source/api.rst`` that are not actually +This will identify methods documented in ``doc/source/api.rst`` that are not actually class methods, and existing methods that are not documented in ``doc/source/api.rst``. @@ -342,30 +343,6 @@ Requirements First, you need to have a development environment to be able to build pandas (see the docs on :ref:`creating a development environment above `). -Further, to build the docs, there are some extra requirements: you will need to -have ``sphinx`` and ``ipython`` installed. 
`numpydoc -`_ is used to parse the docstrings that -follow the Numpy Docstring Standard (see above), but you don't need to install -this because a local copy of numpydoc is included in the *pandas* source -code. `nbsphinx `_ is required to build -the Jupyter notebooks included in the documentation. - -If you have a conda environment named ``pandas_dev``, you can install the extra -requirements with:: - - conda install -n pandas_dev sphinx ipython nbconvert nbformat - conda install -n pandas_dev -c conda-forge nbsphinx - -Furthermore, it is recommended to have all :ref:`optional dependencies `. -installed. This is not strictly necessary, but be aware that you will see some error -messages when building the docs. This happens because all the code in the documentation -is executed during the doc build, and so code examples using optional dependencies -will generate errors. Run ``pd.show_versions()`` to get an overview of the installed -version of all dependencies. - -.. warning:: - - You need to have ``sphinx`` version >= 1.3.2. Building the documentation ~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -386,10 +363,10 @@ If you want to do a full clean build, do:: python make.py clean python make.py html -Starting with *pandas* 0.13.1 you can tell ``make.py`` to compile only a single section -of the docs, greatly reducing the turn-around time for checking your changes. -You will be prompted to delete ``.rst`` files that aren't required. This is okay because -the prior versions of these files can be checked out from git. However, you must make sure +You can tell ``make.py`` to compile only a single section of the docs, greatly +reducing the turn-around time for checking your changes. You will be prompted to +delete ``.rst`` files that aren't required. This is okay because the prior +versions of these files can be checked out from git. However, you must make sure not to commit the file deletions to your Git repository! :: @@ -422,6 +399,8 @@ the documentation are also built by Travis-CI. These docs are then hosted `here `__, see also the :ref:`Continuous Integration ` section. +.. _contributing.code: + Contributing to the code base ============================= @@ -480,7 +459,7 @@ Once configured, you can run the tool as follows:: clang-format modified-c-file This will output what your file will look like if the changes are made, and to apply -them, just run the following command:: +them, run the following command:: clang-format -i modified-c-file @@ -1033,7 +1012,7 @@ delete your branch:: git checkout master git merge upstream/master -Then you can just do:: +Then you can do:: git branch -d shiny-new-feature @@ -1043,3 +1022,6 @@ branch has not actually been merged. The branch will still exist on GitHub, so to delete it there do:: git push origin --delete shiny-new-feature + + +.. _Gitter: https://gitter.im/pydata/pandas diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index 2348a3d10c54f..69913b2c1fbd8 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -53,6 +53,18 @@ the latest web technologies. Its goal is to provide elegant, concise constructio graphics in the style of Protovis/D3, while delivering high-performance interactivity over large data to thin clients. +`seaborn `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Seaborn is a Python visualization library based on `matplotlib +`__. It provides a high-level, dataset-oriented +interface for creating attractive statistical graphics. 
The plotting functions +in seaborn understand pandas objects and leverage pandas grouping operations +internally to support concise specification of complex visualizations. Seaborn +also goes beyond matplotlib and pandas with the option to perform statistical +estimation while plotting, aggregating across observations and visualizing the +fit of statistical models to emphasize patterns in a dataset. + `yhat/ggplot `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -64,15 +76,6 @@ but a faithful implementation for python users has long been missing. Although s (as of Jan-2014), the `yhat/ggplot `__ project has been progressing quickly in that direction. -`Seaborn `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Although pandas has quite a bit of "just plot it" functionality built-in, visualization and -in particular statistical graphics is a vast field with a long tradition and lots of ground -to cover. The `Seaborn `__ project builds on top of pandas -and `matplotlib `__ to provide easy plotting of data which extends to -more advanced types of plots then those offered by pandas. - `Vincent `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -222,7 +225,13 @@ Out-of-core ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Dask is a flexible parallel computing library for analytics. Dask -allow a familiar ``DataFrame`` interface to out-of-core, parallel and distributed computing. +provides a familiar ``DataFrame`` interface for out-of-core, parallel and distributed computing. + +`Dask-ML `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Dask-ML enables parallel and distributed machine learning using Dask alongside existing machine learning libraries like Scikit-Learn, XGBoost, and TensorFlow. + `Blaze `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index fdb002a642d62..b329fac969343 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -1835,15 +1835,27 @@ that you've done this: Yikes! +.. _indexing.evaluation_order: + Evaluation order matters ~~~~~~~~~~~~~~~~~~~~~~~~ -Furthermore, in chained expressions, the order may determine whether a copy is returned or not. -If an expression will set values on a copy of a slice, then a ``SettingWithCopy`` -warning will be issued. +When you use chained indexing, the order and type of the indexing operation +partially determine whether the result is a slice into the original object, or +a copy of the slice. + +Pandas has the ``SettingWithCopyWarning`` because assigning to a copy of a +slice is frequently not intentional, but a mistake caused by chained indexing +returning a copy where a slice was expected. + +If you would like pandas to be more or less trusting about assignment to a +chained indexing expression, you can set the :ref:`option ` +``mode.chained_assignment`` to one of these values: -You can control the action of a chained assignment via the option ``mode.chained_assignment``, -which can take the values ``['raise','warn',None]``, where showing a warning is the default. +* ``'warn'``, the default, means a ``SettingWithCopyWarning`` is printed. +* ``'raise'`` means pandas will raise a ``SettingWithCopyException`` + you have to deal with. +* ``None`` will suppress the warnings entirely. .. ipython:: python :okwarning: diff --git a/doc/source/install.rst b/doc/source/install.rst index c805f84d0faaa..27dde005e5a87 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -141,28 +141,24 @@ and can take a few minutes to complete. 
Installing using your Linux distribution's package manager. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The commands in this table will install pandas for Python 2 from your distribution. -To install pandas for Python 3 you may need to use the package ``python3-pandas``. +The commands in this table will install pandas for Python 3 from your distribution. +To install pandas for Python 2 you may need to use the package ``python-pandas``. .. csv-table:: :header: "Distribution", "Status", "Download / Repository Link", "Install method" :widths: 10, 10, 20, 50 - Debian, stable, `official Debian repository `__ , ``sudo apt-get install python-pandas`` - Debian & Ubuntu, unstable (latest packages), `NeuroDebian `__ , ``sudo apt-get install python-pandas`` - Ubuntu, stable, `official Ubuntu repository `__ , ``sudo apt-get install python-pandas`` - Ubuntu, unstable (daily builds), `PythonXY PPA `__; activate by: ``sudo add-apt-repository ppa:pythonxy/pythonxy-devel && sudo apt-get update``, ``sudo apt-get install python-pandas`` - OpenSuse, stable, `OpenSuse Repository `__ , ``zypper in python-pandas`` - Fedora, stable, `official Fedora repository `__ , ``dnf install python-pandas`` - Centos/RHEL, stable, `EPEL repository `__ , ``yum install python-pandas`` - - - - - - + Debian, stable, `official Debian repository `__ , ``sudo apt-get install python3-pandas`` + Debian & Ubuntu, unstable (latest packages), `NeuroDebian `__ , ``sudo apt-get install python3-pandas`` + Ubuntu, stable, `official Ubuntu repository `__ , ``sudo apt-get install python3-pandas`` + OpenSuse, stable, `OpenSuse Repository `__ , ``zypper in python3-pandas`` + Fedora, stable, `official Fedora repository `__ , ``dnf install python3-pandas`` + Centos/RHEL, stable, `EPEL repository `__ , ``yum install python3-pandas`` +**However**, the packages in the linux package managers are often a few versions behind, so +to get the newest version of pandas, it's recommended to install using the ``pip`` or ``conda`` +methods described above. Installing from source @@ -258,7 +254,8 @@ Optional Dependencies `__, or `xclip `__: necessary to use :func:`~pandas.read_clipboard`. Most package managers on Linux distributions will have ``xclip`` and/or ``xsel`` immediately available for installation. -* For Google BigQuery I/O - see `here `__ +* `pandas-gbq `__: for Google BigQuery I/O. + * `Backports.lzma `__: Only for Python 2, for writing to and/or reading from an xz compressed DataFrame in CSV; Python 3 support is built into the standard library. * One of the following combinations of libraries is needed to use the diff --git a/doc/source/io.rst b/doc/source/io.rst index 82cb83c168b22..4024414610a82 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -103,15 +103,20 @@ Column and Index Locations and Names ++++++++++++++++++++++++++++++++++++ header : int or list of ints, default ``'infer'`` - Row number(s) to use as the column names, and the start of the data. Default - behavior is as if ``header=0`` if no ``names`` passed, otherwise as if - ``header=None``. Explicitly pass ``header=0`` to be able to replace existing - names. The header can be a list of ints that specify row locations for a - multi-index on the columns e.g. ``[0,1,3]``. Intervening rows that are not - specified will be skipped (e.g. 2 in this example is skipped). Note that - this parameter ignores commented lines and empty lines if - ``skip_blank_lines=True``, so header=0 denotes the first line of data - rather than the first line of the file. 
+ Row number(s) to use as the column names, and the start of the + data. Default behavior is to infer the column names: if no names are + passed the behavior is identical to ``header=0`` and column names + are inferred from the first line of the file, if column names are + passed explicitly then the behavior is identical to + ``header=None``. Explicitly pass ``header=0`` to be able to replace + existing names. + + The header can be a list of ints that specify row locations + for a multi-index on the columns e.g. ``[0,1,3]``. Intervening rows + that are not specified will be skipped (e.g. 2 in this example is + skipped). Note that this parameter ignores commented lines and empty + lines if ``skip_blank_lines=True``, so header=0 denotes the first + line of data rather than the first line of the file. names : array-like, default ``None`` List of column names to use. If file contains no header row, then you should explicitly pass ``header=None``. Duplicates in this list will cause @@ -553,6 +558,14 @@ If the header is in a row other than the first, pass the row number to data = 'skip this skip it\na,b,c\n1,2,3\n4,5,6\n7,8,9' pd.read_csv(StringIO(data), header=1) +.. note:: + + Default behavior is to infer the column names: if no names are + passed the behavior is identical to ``header=0`` and column names + are inferred from the first nonblank line of the file, if column + names are passed explicitly then the behavior is identical to + ``header=None``. + .. _io.dupe_names: Duplicate names parsing @@ -4469,8 +4482,10 @@ Several caveats. - This is a newer library, and the format, though stable, is not guaranteed to be backward compatible to the earlier versions. -- The format will NOT write an ``Index``, or ``MultiIndex`` for the ``DataFrame`` and will raise an - error if a non-default one is provided. You can simply ``.reset_index()`` in order to store the index. +- The format will NOT write an ``Index``, or ``MultiIndex`` for the + ``DataFrame`` and will raise an error if a non-default one is provided. You + can ``.reset_index()`` to store the index or ``.reset_index(drop=True)`` to + ignore it. - Duplicate column names and non-string columns names are not supported - Non supported types include ``Period`` and actual python object types. These will raise a helpful error message on an attempt at serialization. @@ -4533,8 +4548,10 @@ dtypes, including extension dtypes such as datetime with tz. Several caveats. -- The format will NOT write an ``Index``, or ``MultiIndex`` for the ``DataFrame`` and will raise an - error if a non-default one is provided. You can simply ``.reset_index(drop=True)`` in order to store the index. +- The format will NOT write an ``Index``, or ``MultiIndex`` for the + ``DataFrame`` and will raise an error if a non-default one is provided. You + can ``.reset_index()`` to store the index or ``.reset_index(drop=True)`` to + ignore it. - Duplicate column names and non-string columns names are not supported - Categorical dtypes can be serialized to parquet, but will de-serialize as ``object`` dtype. - Non supported types include ``Period`` and actual python object types. These will raise a helpful error message @@ -4580,6 +4597,15 @@ Read from a parquet file. result.dtypes +Read only certain columns of a parquet file. + +.. ipython:: python + + result = pd.read_parquet('example_fp.parquet', engine='fastparquet', columns=['a', 'b']) + + result.dtypes + + .. 
ipython:: python :suppress: diff --git a/doc/source/options.rst b/doc/source/options.rst index 2da55a5a658a4..db3380bd4a3e7 100644 --- a/doc/source/options.rst +++ b/doc/source/options.rst @@ -273,164 +273,167 @@ Options are 'right', and 'left'. Available Options ----------------- -=================================== ============ ================================== -Option Default Function -=================================== ============ ================================== -display.chop_threshold None If set to a float value, all float - values smaller then the given - threshold will be displayed as - exactly 0 by repr and friends. -display.colheader_justify right Controls the justification of - column headers. used by DataFrameFormatter. -display.column_space 12 No description available. -display.date_dayfirst False When True, prints and parses dates - with the day first, eg 20/01/2005 -display.date_yearfirst False When True, prints and parses dates - with the year first, eg 2005/01/20 -display.encoding UTF-8 Defaults to the detected encoding - of the console. Specifies the encoding - to be used for strings returned by - to_string, these are generally strings - meant to be displayed on the console. -display.expand_frame_repr True Whether to print out the full DataFrame - repr for wide DataFrames across - multiple lines, `max_columns` is - still respected, but the output will - wrap-around across multiple "pages" - if its width exceeds `display.width`. -display.float_format None The callable should accept a floating - point number and return a string with - the desired format of the number. - This is used in some places like - SeriesFormatter. - See core.format.EngFormatter for an example. -display.large_repr truncate For DataFrames exceeding max_rows/max_cols, - the repr (and HTML repr) can show - a truncated table (the default), - or switch to the view from df.info() - (the behaviour in earlier versions of pandas). - allowable settings, ['truncate', 'info'] -display.latex.repr False Whether to produce a latex DataFrame - representation for jupyter frontends - that support it. -display.latex.escape True Escapes special characters in DataFrames, when - using the to_latex method. -display.latex.longtable False Specifies if the to_latex method of a DataFrame - uses the longtable format. -display.latex.multicolumn True Combines columns when using a MultiIndex -display.latex.multicolumn_format 'l' Alignment of multicolumn labels -display.latex.multirow False Combines rows when using a MultiIndex. - Centered instead of top-aligned, - separated by clines. -display.max_columns 20 max_rows and max_columns are used - in __repr__() methods to decide if - to_string() or info() is used to - render an object to a string. In - case python/IPython is running in - a terminal this can be set to 0 and - pandas will correctly auto-detect - the width the terminal and swap to - a smaller format in case all columns - would not fit vertically. The IPython - notebook, IPython qtconsole, or IDLE - do not run in a terminal and hence - it is not possible to do correct - auto-detection. 'None' value means - unlimited. -display.max_colwidth 50 The maximum width in characters of - a column in the repr of a pandas - data structure. When the column overflows, - a "..." placeholder is embedded in - the output. -display.max_info_columns 100 max_info_columns is used in DataFrame.info - method to decide if per column information - will be printed. 
-display.max_info_rows 1690785 df.info() will usually show null-counts - for each column. For large frames - this can be quite slow. max_info_rows - and max_info_cols limit this null - check only to frames with smaller - dimensions then specified. -display.max_rows 60 This sets the maximum number of rows - pandas should output when printing - out various output. For example, - this value determines whether the - repr() for a dataframe prints out - fully or just a summary repr. - 'None' value means unlimited. -display.max_seq_items 100 when pretty-printing a long sequence, - no more then `max_seq_items` will - be printed. If items are omitted, - they will be denoted by the addition - of "..." to the resulting string. - If set to None, the number of items - to be printed is unlimited. -display.memory_usage True This specifies if the memory usage of - a DataFrame should be displayed when the - df.info() method is invoked. -display.multi_sparse True "Sparsify" MultiIndex display (don't - display repeated elements in outer - levels within groups) -display.notebook_repr_html True When True, IPython notebook will - use html representation for - pandas objects (if it is available). -display.pprint_nest_depth 3 Controls the number of nested levels - to process when pretty-printing -display.precision 6 Floating point output precision in - terms of number of places after the - decimal, for regular formatting as well - as scientific notation. Similar to - numpy's ``precision`` print option -display.show_dimensions truncate Whether to print out dimensions - at the end of DataFrame repr. - If 'truncate' is specified, only - print out the dimensions if the - frame is truncated (e.g. not display - all rows and/or columns) -display.width 80 Width of the display in characters. - In case python/IPython is running in - a terminal this can be set to None - and pandas will correctly auto-detect - the width. Note that the IPython notebook, - IPython qtconsole, or IDLE do not run in a - terminal and hence it is not possible - to correctly detect the width. -display.html.table_schema False Whether to publish a Table Schema - representation for frontends that - support it. -display.html.border 1 A ``border=value`` attribute is - inserted in the ```` tag - for the DataFrame HTML repr. -io.excel.xls.writer xlwt The default Excel writer engine for - 'xls' files. -io.excel.xlsm.writer openpyxl The default Excel writer engine for - 'xlsm' files. Available options: - 'openpyxl' (the default). -io.excel.xlsx.writer openpyxl The default Excel writer engine for - 'xlsx' files. -io.hdf.default_format None default format writing format, if - None, then put will default to - 'fixed' and append will default to - 'table' -io.hdf.dropna_table True drop ALL nan rows when appending - to a table -io.parquet.engine None The engine to use as a default for - parquet reading and writing. If None - then try 'pyarrow' and 'fastparquet' -mode.chained_assignment warn Raise an exception, warn, or no - action if trying to use chained - assignment, The default is warn -mode.sim_interactive False Whether to simulate interactive mode - for purposes of testing. -mode.use_inf_as_na False True means treat None, NaN, -INF, - INF as NA (old way), False means - None and NaN are null, but INF, -INF - are not NA (new way). -compute.use_bottleneck True Use the bottleneck library to accelerate - computation if it is installed. -compute.use_numexpr True Use the numexpr library to accelerate - computation if it is installed. 
-=================================== ============ ================================== +======================================= ============ ================================== +Option Default Function +======================================= ============ ================================== +display.chop_threshold None If set to a float value, all float + values smaller then the given + threshold will be displayed as + exactly 0 by repr and friends. +display.colheader_justify right Controls the justification of + column headers. used by DataFrameFormatter. +display.column_space 12 No description available. +display.date_dayfirst False When True, prints and parses dates + with the day first, eg 20/01/2005 +display.date_yearfirst False When True, prints and parses dates + with the year first, eg 2005/01/20 +display.encoding UTF-8 Defaults to the detected encoding + of the console. Specifies the encoding + to be used for strings returned by + to_string, these are generally strings + meant to be displayed on the console. +display.expand_frame_repr True Whether to print out the full DataFrame + repr for wide DataFrames across + multiple lines, `max_columns` is + still respected, but the output will + wrap-around across multiple "pages" + if its width exceeds `display.width`. +display.float_format None The callable should accept a floating + point number and return a string with + the desired format of the number. + This is used in some places like + SeriesFormatter. + See core.format.EngFormatter for an example. +display.large_repr truncate For DataFrames exceeding max_rows/max_cols, + the repr (and HTML repr) can show + a truncated table (the default), + or switch to the view from df.info() + (the behaviour in earlier versions of pandas). + allowable settings, ['truncate', 'info'] +display.latex.repr False Whether to produce a latex DataFrame + representation for jupyter frontends + that support it. +display.latex.escape True Escapes special characters in DataFrames, when + using the to_latex method. +display.latex.longtable False Specifies if the to_latex method of a DataFrame + uses the longtable format. +display.latex.multicolumn True Combines columns when using a MultiIndex +display.latex.multicolumn_format 'l' Alignment of multicolumn labels +display.latex.multirow False Combines rows when using a MultiIndex. + Centered instead of top-aligned, + separated by clines. +display.max_columns 20 max_rows and max_columns are used + in __repr__() methods to decide if + to_string() or info() is used to + render an object to a string. In + case python/IPython is running in + a terminal this can be set to 0 and + pandas will correctly auto-detect + the width the terminal and swap to + a smaller format in case all columns + would not fit vertically. The IPython + notebook, IPython qtconsole, or IDLE + do not run in a terminal and hence + it is not possible to do correct + auto-detection. 'None' value means + unlimited. +display.max_colwidth 50 The maximum width in characters of + a column in the repr of a pandas + data structure. When the column overflows, + a "..." placeholder is embedded in + the output. +display.max_info_columns 100 max_info_columns is used in DataFrame.info + method to decide if per column information + will be printed. +display.max_info_rows 1690785 df.info() will usually show null-counts + for each column. For large frames + this can be quite slow. max_info_rows + and max_info_cols limit this null + check only to frames with smaller + dimensions then specified. 
+display.max_rows 60 This sets the maximum number of rows + pandas should output when printing + out various output. For example, + this value determines whether the + repr() for a dataframe prints out + fully or just a summary repr. + 'None' value means unlimited. +display.max_seq_items 100 when pretty-printing a long sequence, + no more then `max_seq_items` will + be printed. If items are omitted, + they will be denoted by the addition + of "..." to the resulting string. + If set to None, the number of items + to be printed is unlimited. +display.memory_usage True This specifies if the memory usage of + a DataFrame should be displayed when the + df.info() method is invoked. +display.multi_sparse True "Sparsify" MultiIndex display (don't + display repeated elements in outer + levels within groups) +display.notebook_repr_html True When True, IPython notebook will + use html representation for + pandas objects (if it is available). +display.pprint_nest_depth 3 Controls the number of nested levels + to process when pretty-printing +display.precision 6 Floating point output precision in + terms of number of places after the + decimal, for regular formatting as well + as scientific notation. Similar to + numpy's ``precision`` print option +display.show_dimensions truncate Whether to print out dimensions + at the end of DataFrame repr. + If 'truncate' is specified, only + print out the dimensions if the + frame is truncated (e.g. not display + all rows and/or columns) +display.width 80 Width of the display in characters. + In case python/IPython is running in + a terminal this can be set to None + and pandas will correctly auto-detect + the width. Note that the IPython notebook, + IPython qtconsole, or IDLE do not run in a + terminal and hence it is not possible + to correctly detect the width. +display.html.table_schema False Whether to publish a Table Schema + representation for frontends that + support it. +display.html.border 1 A ``border=value`` attribute is + inserted in the ``
`` tag + for the DataFrame HTML repr. +io.excel.xls.writer xlwt The default Excel writer engine for + 'xls' files. +io.excel.xlsm.writer openpyxl The default Excel writer engine for + 'xlsm' files. Available options: + 'openpyxl' (the default). +io.excel.xlsx.writer openpyxl The default Excel writer engine for + 'xlsx' files. +io.hdf.default_format None default format writing format, if + None, then put will default to + 'fixed' and append will default to + 'table' +io.hdf.dropna_table True drop ALL nan rows when appending + to a table +io.parquet.engine None The engine to use as a default for + parquet reading and writing. If None + then try 'pyarrow' and 'fastparquet' +mode.chained_assignment warn Controls ``SettingWithCopyWarning``: + 'raise', 'warn', or None. Raise an + exception, warn, or no action if + trying to use :ref:`chained assignment `. +mode.sim_interactive False Whether to simulate interactive mode + for purposes of testing. +mode.use_inf_as_na False True means treat None, NaN, -INF, + INF as NA (old way), False means + None and NaN are null, but INF, -INF + are not NA (new way). +compute.use_bottleneck True Use the bottleneck library to accelerate + computation if it is installed. +compute.use_numexpr True Use the numexpr library to accelerate + computation if it is installed. +plotting.matplotlib.register_converters True Register custom converters with + matplotlib. Set to False to de-register. +======================================= ============ ======================================== .. _basics.console_output: diff --git a/doc/source/release.rst b/doc/source/release.rst index 6c3e7f847b485..a3289b1144863 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -52,7 +52,7 @@ Highlights include: - Integration with `Apache Parquet `__, including a new top-level :func:`read_parquet` function and :meth:`DataFrame.to_parquet` method, see :ref:`here `. - New user-facing :class:`pandas.api.types.CategoricalDtype` for specifying categoricals independent of the data, see :ref:`here `. -- The behavior of ``sum`` and ``prod`` on all-NaN Series/DataFrames is now consistent and no longer depends on whether `bottleneck `__ is installed, see :ref:`here `. +- The behavior of ``sum`` and ``prod`` on all-NaN Series/DataFrames is now consistent and no longer depends on whether `bottleneck `__ is installed, and ``sum`` and ``prod`` on empty Series now return NaN instead of 0, see :ref:`here `. - Compatibility fixes for pypy, see :ref:`here `. - Additions to the ``drop``, ``reindex`` and ``rename`` API to make them more consistent, see :ref:`here `. - Addition of the new methods ``DataFrame.infer_objects`` (see :ref:`here `) and ``GroupBy.pipe`` (see :ref:`here `). diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 4c460eeb85b82..89e2d3006696c 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -12,7 +12,7 @@ Highlights include: - Integration with `Apache Parquet `__, including a new top-level :func:`read_parquet` function and :meth:`DataFrame.to_parquet` method, see :ref:`here `. - New user-facing :class:`pandas.api.types.CategoricalDtype` for specifying categoricals independent of the data, see :ref:`here `. -- The behavior of ``sum`` and ``prod`` on all-NaN Series/DataFrames is now consistent and no longer depends on whether `bottleneck `__ is installed, see :ref:`here `. 
+- The behavior of ``sum`` and ``prod`` on all-NaN Series/DataFrames is now consistent and no longer depends on whether `bottleneck `__ is installed, and ``sum`` and ``prod`` on empty Series now return NaN instead of 0, see :ref:`here `. - Compatibility fixes for pypy, see :ref:`here `. - Additions to the ``drop``, ``reindex`` and ``rename`` API to make them more consistent, see :ref:`here `. - Addition of the new methods ``DataFrame.infer_objects`` (see :ref:`here `) and ``GroupBy.pipe`` (see :ref:`here `). @@ -369,11 +369,11 @@ Additionally, support has been dropped for Python 3.4 (:issue:`15251`).
.. _whatsnew_0210.api_breaking.bottleneck:
-Sum/Prod of all-NaN Series/DataFrames is now consistently NaN -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Sum/Prod of all-NaN or empty Series/DataFrames is now consistently NaN +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The behavior of ``sum`` and ``prod`` on all-NaN Series/DataFrames no longer depends on -whether `bottleneck `__ is installed. (:issue:`9422`, :issue:`15507`). +whether `bottleneck `__ is installed, and return value of ``sum`` and ``prod`` on an empty Series has changed (:issue:`9422`, :issue:`15507`).
Calling ``sum`` or ``prod`` on an empty or all-``NaN`` ``Series``, or columns of a ``DataFrame``, will result in ``NaN``. See the :ref:`docs `. @@ -381,35 +381,35 @@ Calling ``sum`` or ``prod`` on an empty or all-``NaN`` ``Series``, or columns of s = Series([np.nan])
-Previously NO ``bottleneck`` +Previously WITHOUT ``bottleneck`` installed:
.. code-block:: ipython
In [2]: s.sum() Out[2]: np.nan
-Previously WITH ``bottleneck`` +Previously WITH ``bottleneck``:
.. code-block:: ipython
In [2]: s.sum() Out[2]: 0.0
-New Behavior, without regard to the bottleneck installation. +New Behavior, without regard to the bottleneck installation:
.. ipython:: python
s.sum()
-Note that this also changes the sum of an empty ``Series`` - -Previously regardless of ``bottlenck`` +Note that this also changes the sum of an empty ``Series``. Previously this always returned 0 regardless of a ``bottleneck`` installation:
.. code-block:: ipython
In [1]: pd.Series([]).sum() Out[1]: 0
+but for consistency with the all-NaN case, this was changed to return NaN as well: +
.. ipython:: python
pd.Series([]).sum()
@@ -877,6 +877,28 @@ New Behavior: pd.interval_range(start=0, end=4)
+.. _whatsnew_0210.api.mpl_converters: + +No Automatic Matplotlib Converters +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Pandas no longer registers our ``date``, ``time``, ``datetime``, +``datetime64``, and ``Period`` converters with matplotlib when pandas is +imported. Matplotlib plot methods (``plt.plot``, ``ax.plot``, ...) will not +nicely format the x-axis for ``DatetimeIndex`` or ``PeriodIndex`` values. You +must explicitly register these converters: + +.. ipython:: python + + from pandas.tseries import converter + converter.register() + + fig, ax = plt.subplots() + plt.plot(pd.date_range('2017', periods=6), range(6)) + +Pandas built-in ``Series.plot`` and ``DataFrame.plot`` *will* register these +converters on first-use (:issue:`17710`). +
.. _whatsnew_0210.api:
Other API Changes @@ -900,8 +922,6 @@ Other API Changes - Renamed non-functional ``index`` to ``index_col`` in :func:`read_stata` to improve API consistency (:issue:`16342`) - Bug in :func:`DataFrame.drop` caused boolean labels ``False`` and ``True`` to be treated as labels 0 and 1 respectively when dropping indices from a numeric index.
This will now raise a ValueError (:issue:`16877`) - Restricted DateOffset keyword arguments. Previously, ``DateOffset`` subclasses allowed arbitrary keyword arguments which could lead to unexpected behavior. Now, only valid arguments will be accepted. (:issue:`17176`). -- Pandas no longer registers matplotlib converters on import. The converters - will be registered and used when the first plot is draw (:issue:`17710`)
.. _whatsnew_0210.deprecations:
diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt index 56412651f13f0..00726a4606cf7 100644 --- a/doc/source/whatsnew/v0.21.1.txt +++ b/doc/source/whatsnew/v0.21.1.txt @@ -7,6 +7,36 @@ This is a minor release from 0.21.1 and includes a number of deprecations, new features, enhancements, and performance improvements along with a large number of bug fixes. We recommend that all users upgrade to this version.
+.. _whatsnew_0211.special: + +Restore Matplotlib datetime Converter Registration +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Pandas implements some matplotlib converters for nicely formatting the axis +labels on plots with ``datetime`` or ``Period`` values. Prior to pandas 0.21.0, +these were implicitly registered with matplotlib, as a side effect of ``import +pandas``. + +In pandas 0.21.0, we required users to explicitly register the +converters. This caused problems for some users who relied on those converters +being present for regular ``matplotlib.pyplot`` plotting methods, so we're +temporarily reverting that change; pandas will again register the converters on +import. + +We've added a new option to control the converters: +``pd.options.plotting.matplotlib.register_converters``. By default, they are +registered. Toggling this to ``False`` removes pandas' formatters and restores +any converters we overwrote when registering them (:issue:`18301`). + +We're working with the matplotlib developers to make this easier. We're trying +to balance user convenience (automatically registering the converters) with +import performance and best practices (importing pandas shouldn't have the side +effect of overwriting any custom converters you've already set). In the future +we hope to have most of the datetime formatting functionality in matplotlib, +with just the pandas-specific converters in pandas. We'll then gracefully +deprecate the automatic registration of converters in favor of users explicitly +registering them when they want them. +
.. _whatsnew_0211.enhancements:
New features @@ -21,8 +51,8 @@ New features Other Enhancements ^^^^^^^^^^^^^^^^^^ -- -- +- :meth:`Timestamp.timestamp` is now available in Python 2.7 (:issue:`17329`) +- :class:`Grouper` and :class:`TimeGrouper` now have a friendly repr output (:issue:`18203`). -
.. _whatsnew_0211.deprecations:
Deprecations ~~~~~~~~~~~~ -- -- -- +- ``pandas.tseries.register`` has been renamed to + :func:`pandas.plotting.register_matplotlib_converters` (:issue:`18301`)
.. _whatsnew_0211.performance:
Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ -- +- Improved performance of plotting large series/dataframes (:issue:`18236`).
- - @@ -60,15 +89,21 @@ Bug Fixes Conversion ^^^^^^^^^^ -- -- +- Bug in :class:`TimedeltaIndex` where subtraction could incorrectly overflow when ``NaT`` is present (:issue:`17791`) +- Bug in :class:`DatetimeIndex` where subtracting datetimelike from DatetimeIndex could fail to overflow (:issue:`18020`) +- Bug in :meth:`IntervalIndex.copy` when copying an ``IntervalIndex`` with non-default ``closed`` (:issue:`18339`) +- Bug in :func:`DataFrame.to_dict` where columns of datetime that are tz-aware were not converted to required arrays when used with ``orient='records'``, raising ``TypeError`` (:issue:`18372`) +- Bug in :class:`DatetimeIndex` and :meth:`date_range` where mismatching tz-aware ``start`` and ``end`` timezones would not raise an error if ``end.tzinfo`` is None (:issue:`18431`) - Indexing ^^^^^^^^ -- -- +- Bug in a boolean comparison of a ``datetime.datetime`` and a ``datetime64[ns]`` dtype Series (:issue:`17965`) +- Bug where a ``MultiIndex`` with more than a million records was not raising ``AttributeError`` when trying to access a missing attribute (:issue:`18165`) +- Bug in :class:`IntervalIndex` constructor when a list of intervals is passed with non-default ``closed`` (:issue:`18334`) +- Bug in ``Index.putmask`` when an invalid mask is passed (:issue:`18368`) +- Bug in masked assignment of a ``timedelta64[ns]`` dtype ``Series``, incorrectly coerced to float (:issue:`18493`) - I/O @@ -76,18 +111,32 @@ I/O - Bug in class:`~pandas.io.stata.StataReader` not converting date/time columns with display formatting addressed (:issue:`17990`). Previously columns with display formatting were normally left as ordinal numbers and not converted to datetime objects. - Bug in :func:`read_csv` when reading a compressed UTF-16 encoded file (:issue:`18071`) +- Bug in :func:`read_csv` for handling null values in index columns when specifying ``na_filter=False`` (:issue:`5239`) +- Bug in :func:`read_csv` when reading numeric category fields with high cardinality (:issue:`18186`) +- Bug in :meth:`DataFrame.to_csv` when the table had ``MultiIndex`` columns, and a list of strings was passed in for ``header`` (:issue:`5539`) +- :func:`read_parquet` now allows specifying the columns to read from a parquet file (:issue:`18154`) +- :func:`read_parquet` now allows specifying kwargs which are passed to the respective engine (:issue:`18216`) +- Bug in parsing integer datetime-like columns with specified format in ``read_sql`` (:issue:`17855`).
+- Bug in :meth:`DataFrame.to_msgpack` when serializing data of the ``numpy.bool_`` datatype (:issue:`18390`) +- Bug in :func:`read_json` not decoding when reading line-delimited JSON from S3 (:issue:`17200`) +- Bug in :func:`pandas.io.json.json_normalize` where the ``meta`` argument was modified in place (:issue:`18610`) +- Bug in :func:`to_latex` where repeated multi-index values were not printed even though a higher level index differed from the previous row (:issue:`14484`) + Plotting ^^^^^^^^ -- +- Bug in ``DataFrame.plot()`` and ``Series.plot()`` with :class:`DatetimeIndex` where a figure generated by them is not pickleable in Python 3 (:issue:`18439`) - - Groupby/Resample/Rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- +- Bug in ``DataFrame.resample(...).apply(...)`` when there is a callable that returns different columns (:issue:`15169`) +- Bug in ``DataFrame.resample(...)`` when there is a time change (DST) and resampling frequency is 12h or higher (:issue:`15549`) +- Bug in ``pd.DataFrameGroupBy.count()`` when counting over a datetimelike column (:issue:`13393`) +- Bug in ``rolling.var`` where calculation is inaccurate with a zero-valued array (:issue:`18430`) - - @@ -101,13 +150,15 @@ Sparse Reshaping ^^^^^^^^^ -- -- -- +- Error message in ``pd.merge_asof()`` for key datatype mismatch now includes datatype of left and right key (:issue:`18068`) +- Bug in ``pd.concat`` when empty and non-empty DataFrames or Series are concatenated (:issue:`18178`, :issue:`18187`) +- Bug in ``DataFrame.filter(...)`` when :class:`unicode` is passed as a condition in Python 2 (:issue:`13101`) +- Bug when merging empty DataFrames when ``np.seterr(divide='raise')`` is set (:issue:`17776`) Numeric ^^^^^^^ +- Bug in ``pd.Series.rolling.skew()`` and ``rolling.kurt()`` where all-equal values caused a floating point precision issue (:issue:`18044`) - - - @@ -115,13 +166,20 @@ Numeric Categorical ^^^^^^^^^^^ -- -- -- +- Bug in :meth:`DataFrame.astype` where casting to 'category' on an empty ``DataFrame`` causes a segmentation fault (:issue:`18004`) +- Error messages in the testing module have been improved when items have + different ``CategoricalDtype`` (:issue:`18069`) +- ``CategoricalIndex`` can now correctly take a ``pd.api.types.CategoricalDtype`` as its dtype (:issue:`18116`) +- Bug in ``Categorical.unique()`` returning read-only ``codes`` array when all categories were ``NaN`` (:issue:`18051`) +- Bug in ``DataFrame.groupby(axis=1)`` with a ``CategoricalIndex`` (:issue:`18432`) + +String +^^^^^^ + +- :meth:`Series.str.split()` will now propagate ``NaN`` values across all expanded columns instead of ``None`` (:issue:`18450`) Other ^^^^^ - - -- diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index d159761c3f5e6..a44a7288bda45 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -87,7 +87,7 @@ class NegInfinity(object): @cython.boundscheck(False) def is_lexsorted(list list_of_arrays): cdef: - int i + Py_ssize_t i Py_ssize_t n, nlevels int64_t k, cur, pre ndarray arr @@ -99,11 +99,12 @@ def is_lexsorted(list list_of_arrays): cdef int64_t **vecs = malloc(nlevels * sizeof(int64_t*)) for i in range(nlevels): arr = list_of_arrays[i] + assert arr.dtype.name == 'int64' vecs[i] = arr.data # Assume uniqueness??
with nogil: - for i in range(n): + for i in range(1, n): for k in range(nlevels): cur = vecs[k][i] pre = vecs[k][i -1] diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index c96251a0293d6..65e99f5f46fc2 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -19,7 +19,7 @@ from hashtable cimport HashTable from pandas._libs import algos, hashtable as _hash from pandas._libs.tslib import Timestamp, Timedelta -from datetime import datetime, timedelta +from datetime import datetime, timedelta, date from cpython cimport PyTuple_Check, PyList_Check @@ -500,7 +500,7 @@ cpdef convert_scalar(ndarray arr, object value): if arr.descr.type_num == NPY_DATETIME: if isinstance(value, np.ndarray): pass - elif isinstance(value, datetime): + elif isinstance(value, (datetime, np.datetime64, date)): return Timestamp(value).value elif value is None or value != value: return iNaT diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 44fad899ff099..a90039d789972 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -2221,9 +2221,10 @@ def _concatenate_chunks(list chunks): for name in names: arrs = [chunk.pop(name) for chunk in chunks] # Check each arr for consistent types. - dtypes = set([a.dtype for a in arrs]) - if len(dtypes) > 1: - common_type = np.find_common_type(dtypes, []) + dtypes = {a.dtype for a in arrs} + numpy_dtypes = {x for x in dtypes if not is_categorical_dtype(x)} + if len(numpy_dtypes) > 1: + common_type = np.find_common_type(numpy_dtypes, []) if common_type == np.object: warning_columns.append(str(name)) diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx index b0a64e1ccc225..c340e870e9722 100644 --- a/pandas/_libs/src/inference.pyx +++ b/pandas/_libs/src/inference.pyx @@ -349,13 +349,13 @@ def infer_dtype(object value, bint skipna=False): if values.dtype != np.object_: values = values.astype('O') + # make contiguous + values = values.ravel() + n = len(values) if n == 0: return 'empty' - # make contiguous - values = values.ravel() - # try to use a valid value for i in range(n): val = util.get_value_1d(values, i) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index a0aae6a5de707..20b974ce5a659 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -961,8 +961,7 @@ class NaTType(_NaT): combine = _make_error_func('combine', None) utcnow = _make_error_func('utcnow', None) - if PY3: - timestamp = _make_error_func('timestamp', datetime) + timestamp = _make_error_func('timestamp', Timestamp) # GH9513 NaT methods (except to_datetime64) to raise, return np.nan, or # return NaT create functions that raise, for binding to NaTType @@ -1409,6 +1408,11 @@ cdef class _Timestamp(datetime): def __get__(self): return np.datetime64(self.value, 'ns') + def timestamp(self): + """Return POSIX timestamp as float.""" + # py27 compat, see GH#17329 + return round(self.value / 1e9, 6) + cdef PyTypeObject* ts_type = Timestamp @@ -3366,7 +3370,7 @@ cpdef int64_t tz_convert_single(int64_t val, object tz1, object tz2): """ Convert the val (in i8) from timezone1 to timezone2 - This is a single timezone versoin of tz_convert + This is a single timezone version of tz_convert Parameters ---------- diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index 7f778dde86e23..ba7031bc382b1 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ b/pandas/_libs/tslibs/timezones.pyx @@ -283,10 +283,9 @@ cdef object get_dst_info(object tz): def infer_tzinfo(start, end): if start is not None and end is 
not None: tz = start.tzinfo - if end.tzinfo: - if not (get_timezone(tz) == get_timezone(end.tzinfo)): - msg = 'Inputs must both have the same timezone, {tz1} != {tz2}' - raise AssertionError(msg.format(tz1=tz, tz2=end.tzinfo)) + if not (get_timezone(tz) == get_timezone(end.tzinfo)): + msg = 'Inputs must both have the same timezone, {tz1} != {tz2}' + raise AssertionError(msg.format(tz1=tz, tz2=end.tzinfo)) elif start is not None: tz = start.tzinfo elif end is not None: diff --git a/pandas/_libs/window.pyx b/pandas/_libs/window.pyx index b6bd6f92f6199..a1c4ddbc8d0b0 100644 --- a/pandas/_libs/window.pyx +++ b/pandas/_libs/window.pyx @@ -661,9 +661,11 @@ cdef inline void add_var(double val, double *nobs, double *mean_x, if val == val: nobs[0] = nobs[0] + 1 - delta = (val - mean_x[0]) + # a part of Welford's method for the online variance-calculation + # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance + delta = val - mean_x[0] mean_x[0] = mean_x[0] + delta / nobs[0] - ssqdm_x[0] = ssqdm_x[0] + delta * (val - mean_x[0]) + ssqdm_x[0] = ssqdm_x[0] + ((nobs[0] - 1) * delta ** 2) / nobs[0] cdef inline void remove_var(double val, double *nobs, double *mean_x, @@ -675,9 +677,11 @@ cdef inline void remove_var(double val, double *nobs, double *mean_x, if val == val: nobs[0] = nobs[0] - 1 if nobs[0]: - delta = (val - mean_x[0]) + # a part of Welford's method for the online variance-calculation + # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance + delta = val - mean_x[0] mean_x[0] = mean_x[0] - delta / nobs[0] - ssqdm_x[0] = ssqdm_x[0] - delta * (val - mean_x[0]) + ssqdm_x[0] = ssqdm_x[0] - ((nobs[0] + 1) * delta ** 2) / nobs[0] else: mean_x[0] = 0 ssqdm_x[0] = 0 @@ -689,7 +693,7 @@ def roll_var(ndarray[double_t] input, int64_t win, int64_t minp, Numerically stable implementation using Welford's method. """ cdef: - double val, prev, mean_x = 0, ssqdm_x = 0, nobs = 0, delta + double val, prev, mean_x = 0, ssqdm_x = 0, nobs = 0, delta, mean_x_old int64_t s, e bint is_variable Py_ssize_t i, j, N @@ -749,6 +753,9 @@ def roll_var(ndarray[double_t] input, int64_t win, int64_t minp, add_var(input[i], &nobs, &mean_x, &ssqdm_x) output[i] = calc_var(minp, ddof, nobs, ssqdm_x) + # a part of Welford's method for the online variance-calculation + # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance + # After the first window, observations can both be added and # removed for i from win <= i < N: @@ -760,10 +767,12 @@ def roll_var(ndarray[double_t] input, int64_t win, int64_t minp, # Adding one observation and removing another one delta = val - prev - prev -= mean_x + mean_x_old = mean_x + mean_x += delta / nobs - val -= mean_x - ssqdm_x += (val + prev) * delta + ssqdm_x += ((nobs - 1) * val + + (nobs + 1) * prev + - 2 * nobs * mean_x_old) * delta / nobs else: add_var(val, &nobs, &mean_x, &ssqdm_x) @@ -788,7 +797,17 @@ cdef inline double calc_skew(int64_t minp, int64_t nobs, double x, double xx, A = x / dnobs B = xx / dnobs - A * A C = xxx / dnobs - A * A * A - 3 * A * B - if B <= 0 or nobs < 3: + + # #18044: with uniform distribution, floating issue will + # cause B != 0. and cause the result is a very + # large number. + # + # in core/nanops.py nanskew/nankurt call the function + # _zero_out_fperr(m2) to fix floating error. 
+ # if the variance is less than 1e-14, it could be + # treat as zero, here we follow the original + # skew/kurt behaviour to check B <= 1e-14 + if B <= 1e-14 or nobs < 3: result = NaN else: R = sqrt(B) @@ -915,7 +934,16 @@ cdef inline double calc_kurt(int64_t minp, int64_t nobs, double x, double xx, R = R * A D = xxxx / dnobs - R - 6 * B * A * A - 4 * C * A - if B == 0 or nobs < 4: + # #18044: with uniform distribution, floating issue will + # cause B != 0. and cause the result is a very + # large number. + # + # in core/nanops.py nanskew/nankurt call the function + # _zero_out_fperr(m2) to fix floating error. + # if the variance is less than 1e-14, it could be + # treat as zero, here we follow the original + # skew/kurt behaviour to check B <= 1e-14 + if B <= 1e-14 or nobs < 4: result = NaN else: K = (dnobs * dnobs - 1.) * D / (B * B) - 3 * ((dnobs - 1.) ** 2) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 3853ac017044c..288d9d7742daf 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -257,6 +257,16 @@ def u(s): def u_safe(s): return s + def to_str(s): + """ + Convert bytes and non-string into Python 3 str + """ + if isinstance(s, binary_type): + s = bytes_to_str(s) + elif not isinstance(s, string_types): + s = str(s) + return s + def strlen(data, encoding=None): # encoding is for compat with PY2 return len(data) @@ -302,6 +312,14 @@ def u_safe(s): except: return s + def to_str(s): + """ + Convert unicode and non-string into Python 2 str + """ + if not isinstance(s, string_types): + s = str(s) + return s + def strlen(data, encoding=None): try: data = data.decode(encoding) @@ -381,17 +399,20 @@ def raise_with_traceback(exc, traceback=Ellipsis): # http://stackoverflow.com/questions/4126348 # Thanks to @martineau at SO -from dateutil import parser as _date_parser import dateutil + +if PY2 and LooseVersion(dateutil.__version__) == '2.0': + # dateutil brokenness + raise Exception('dateutil 2.0 incompatible with Python 2.x, you must ' + 'install version 1.5 or 2.1+!') + +from dateutil import parser as _date_parser if LooseVersion(dateutil.__version__) < '2.0': + @functools.wraps(_date_parser.parse) def parse_date(timestr, *args, **kwargs): timestr = bytes(timestr) return _date_parser.parse(timestr, *args, **kwargs) -elif PY2 and LooseVersion(dateutil.__version__) == '2.0': - # dateutil brokenness - raise Exception('dateutil 2.0 incompatible with Python 2.x, you must ' - 'install version 1.5 or 2.1+!') else: parse_date = _date_parser.parse diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index e709c771b7d18..c574e6d56916b 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -2268,7 +2268,7 @@ def _recode_for_categories(codes, old_categories, new_categories): if len(old_categories) == 0: # All null anyway, so just retain the nulls - return codes + return codes.copy() indexer = coerce_indexer_dtype(new_categories.get_indexer(old_categories), new_categories) new_codes = take_1d(indexer, codes.copy(), fill_value=-1) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 33531e80449d8..94208a61a4377 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -479,3 +479,29 @@ def use_inf_as_na_cb(key): cf.register_option( 'engine', 'auto', parquet_engine_doc, validator=is_one_of_factory(['auto', 'pyarrow', 'fastparquet'])) + +# -------- +# Plotting +# --------- + +register_converter_doc = """ +: bool + Whether to register converters with matplotlib's units registry for + 
dates, times, datetimes, and Periods. Toggling to False will remove + the converters, restoring any converters that pandas overwrote. +""" + + +def register_converter_cb(key): + from pandas.plotting import register_matplotlib_converters + from pandas.plotting import deregister_matplotlib_converters + + if cf.get_option(key): + register_matplotlib_converters() + else: + deregister_matplotlib_converters() + + +with cf.config_prefix("plotting.matplotlib"): + cf.register_option("register_converters", True, register_converter_doc, + validator=bool, cb=register_converter_cb) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index f3b11e52cdd7a..eae283e9bc00d 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -136,7 +136,7 @@ def trans(x): # noqa try: if np.allclose(new_result, result, rtol=0): return new_result - except: + except Exception: # comparison of an object dtype with a number type could # hit here @@ -151,14 +151,14 @@ def trans(x): # noqa elif dtype.kind in ['M', 'm'] and result.dtype.kind in ['i', 'f']: try: result = result.astype(dtype) - except: + except Exception: if dtype.tz: # convert to datetime and change timezone from pandas import to_datetime result = to_datetime(result).tz_localize('utc') result = result.tz_convert(dtype.tz) - except: + except Exception: pass return result @@ -210,7 +210,7 @@ def changeit(): new_result[mask] = om_at result[:] = new_result return result, False - except: + except Exception: pass # we are forced to change the dtype of the result as the input @@ -243,7 +243,7 @@ def changeit(): try: np.place(result, mask, other) - except: + except Exception: return changeit() return result, False @@ -274,14 +274,14 @@ def maybe_promote(dtype, fill_value=np.nan): if issubclass(dtype.type, np.datetime64): try: fill_value = tslib.Timestamp(fill_value).value - except: + except Exception: # the proper thing to do here would probably be to upcast # to object (but numpy 1.6.1 doesn't do this properly) fill_value = iNaT elif issubclass(dtype.type, np.timedelta64): try: fill_value = lib.Timedelta(fill_value).value - except: + except Exception: # as for datetimes, cannot upcast to object fill_value = iNaT else: @@ -592,12 +592,12 @@ def maybe_convert_scalar(values): def coerce_indexer_dtype(indexer, categories): """ coerce the indexer input array to the smallest dtype possible """ - l = len(categories) - if l < _int8_max: + length = len(categories) + if length < _int8_max: return _ensure_int8(indexer) - elif l < _int16_max: + elif length < _int16_max: return _ensure_int16(indexer) - elif l < _int32_max: + elif length < _int32_max: return _ensure_int32(indexer) return _ensure_int64(indexer) @@ -629,7 +629,7 @@ def conv(r, dtype): r = float(r) elif dtype.kind == 'i': r = int(r) - except: + except Exception: pass return r @@ -756,7 +756,7 @@ def maybe_convert_objects(values, convert_dates=True, convert_numeric=True, if not isna(new_values).all(): values = new_values - except: + except Exception: pass else: # soft-conversion @@ -817,7 +817,7 @@ def soft_convert_objects(values, datetime=True, numeric=True, timedelta=True, # If all NaNs, then do not-alter values = converted if not isna(converted).all() else values values = values.copy() if copy else values - except: + except Exception: pass return values @@ -888,10 +888,10 @@ def try_datetime(v): try: from pandas import to_datetime return to_datetime(v) - except: + except Exception: pass - except: + except Exception: pass return v.reshape(shape) @@ -903,7 +903,7 @@ def 
try_timedelta(v): from pandas import to_timedelta try: return to_timedelta(v)._values.reshape(shape) - except: + except Exception: return v.reshape(shape) inferred_type = lib.infer_datetimelike_array(_ensure_object(v)) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 93993fd0a0cab..bca5847f3a6cc 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -569,9 +569,10 @@ def _concat_rangeindex_same_dtype(indexes): start = step = next = None - for obj in indexes: - if not len(obj): - continue + # Filter the empty indexes + non_empty_indexes = [obj for obj in indexes if len(obj)] + + for obj in non_empty_indexes: if start is None: # This is set by the first non-empty index @@ -595,8 +596,16 @@ def _concat_rangeindex_same_dtype(indexes): if step is not None: next = obj[-1] + step - if start is None: + if non_empty_indexes: + # Get the stop value from "next" or alternatively + # from the last non-empty index + stop = non_empty_indexes[-1]._stop if next is None else next + else: + # Here all "indexes" had 0 length, i.e. were empty. + # Simply take start, stop, and step from the last empty index. + obj = indexes[-1] start = obj._start step = obj._step - stop = obj._stop if next is None else next + stop = obj._stop + return indexes[0].__class__(start, stop, step) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a1af806e5cb9e..ad79001e45b86 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -997,7 +997,7 @@ def to_dict(self, orient='dict', into=dict): for k, v in compat.iteritems(self)) elif orient.lower().startswith('r'): return [into_c((k, _maybe_box_datetimelike(v)) - for k, v in zip(self.columns, row)) + for k, v in zip(self.columns, np.atleast_1d(row))) for row in self.values] elif orient.lower().startswith('i'): return into_c((k, v.to_dict(into)) for k, v in self.iterrows()) @@ -3751,7 +3751,7 @@ def nlargest(self, n, columns, keep='first'): Number of items to retrieve columns : list or str Column name or names to order by - keep : {'first', 'last', False}, default 'first' + keep : {'first', 'last'}, default 'first' Where there are duplicate values: - ``first`` : take the first occurrence. - ``last`` : take the last occurrence. @@ -3788,7 +3788,7 @@ def nsmallest(self, n, columns, keep='first'): Number of items to retrieve columns : list or str Column name or names to order by - keep : {'first', 'last', False}, default 'first' + keep : {'first', 'last'}, default 'first' Where there are duplicate values: - ``first`` : take the first occurrence. - ``last`` : take the last occurrence. 
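Editor's note, not part of the patch: the ``pandas/_libs/window.pyx`` hunks earlier in this diff rewrite ``add_var``/``remove_var`` with a Welford-style online update. The plain-Python sketch below restates that update outside Cython; the function names mirror the patched code, while the wrapper logic and the tiny usage example at the end are illustrative only.

.. code-block:: python

    def add_var(val, nobs, mean_x, ssqdm_x):
        """Add one observation to the running (count, mean, sum of squared deviations)."""
        nobs += 1
        delta = val - mean_x
        mean_x += delta / nobs
        ssqdm_x += ((nobs - 1) * delta ** 2) / nobs
        return nobs, mean_x, ssqdm_x

    def remove_var(val, nobs, mean_x, ssqdm_x):
        """Remove one observation from the running statistics."""
        nobs -= 1
        if nobs:
            delta = val - mean_x
            mean_x -= delta / nobs
            ssqdm_x -= ((nobs + 1) * delta ** 2) / nobs
        else:
            mean_x, ssqdm_x = 0.0, 0.0
        return nobs, mean_x, ssqdm_x

    # Usage: maintain a 3-observation window over [1.0, 2.0, 4.0, 7.0]
    nobs = mean_x = ssqdm_x = 0.0
    for v in (1.0, 2.0, 4.0):
        nobs, mean_x, ssqdm_x = add_var(v, nobs, mean_x, ssqdm_x)
    # slide the window: drop 1.0, add 7.0
    nobs, mean_x, ssqdm_x = remove_var(1.0, nobs, mean_x, ssqdm_x)
    nobs, mean_x, ssqdm_x = add_var(7.0, nobs, mean_x, ssqdm_x)
    variance = ssqdm_x / (nobs - 1)  # ddof=1, as calc_var computes it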
@@ -4035,6 +4035,8 @@ def combine(self, other, func, fill_value=None, overwrite=True): ---------- other : DataFrame func : function + Function that takes two series as inputs and return a Series or a + scalar fill_value : scalar value overwrite : boolean, default True If True then overwrite values for common keys in the calling frame @@ -4042,8 +4044,21 @@ def combine(self, other, func, fill_value=None, overwrite=True): Returns ------- result : DataFrame - """ + Examples + -------- + >>> df1 = DataFrame({'A': [0, 0], 'B': [4, 4]}) + >>> df2 = DataFrame({'A': [1, 1], 'B': [3, 3]}) + >>> df1.combine(df2, lambda s1, s2: s1 if s1.sum() < s2.sum() else s2) + A B + 0 0 3 + 1 0 3 + + See Also + -------- + DataFrame.combine_first : Combine two DataFrame objects and default to + non-null values in frame calling the method + """ other_idxlen = len(other.index) # save for compare this, other = self.align(other, copy=False) @@ -4131,16 +4146,24 @@ def combine_first(self, other): ---------- other : DataFrame + Returns + ------- + combined : DataFrame + Examples -------- - a's values prioritized, use values from b to fill holes: - - >>> a.combine_first(b) + df1's values prioritized, use values from df2 to fill holes: + >>> df1 = pd.DataFrame([[1, np.nan]]) + >>> df2 = pd.DataFrame([[3, 4]]) + >>> df1.combine_first(df2) + 0 1 + 0 1 4.0 - Returns - ------- - combined : DataFrame + See Also + -------- + DataFrame.combine : Perform series-wise operation on two DataFrames + using a given function """ import pandas.core.computation.expressions as expressions @@ -4283,7 +4306,7 @@ def first_valid_index(self): return valid_indices[0] if len(valid_indices) else None @Appender(_shared_docs['valid_index'] % { - 'position': 'first', 'klass': 'DataFrame'}) + 'position': 'last', 'klass': 'DataFrame'}) def last_valid_index(self): if len(self) == 0: return None @@ -5113,7 +5136,7 @@ def append(self, other, ignore_index=False, verify_integrity=False): >>> df = pd.DataFrame(columns=['A']) >>> for i in range(5): - ... df = df.append({'A'}: i}, ignore_index=True) + ... df = df.append({'A': i}, ignore_index=True) >>> df A 0 0 @@ -5790,7 +5813,12 @@ def idxmin(self, axis=0, skipna=True): 0 or 'index' for row-wise, 1 or 'columns' for column-wise skipna : boolean, default True Exclude NA/null values. If an entire row/column is NA, the result - will be NA + will be NA. + + Raises + ------ + ValueError + * If the row/column is empty Returns ------- @@ -5821,7 +5849,12 @@ def idxmax(self, axis=0, skipna=True): 0 or 'index' for row-wise, 1 or 'columns' for column-wise skipna : boolean, default True Exclude NA/null values. If an entire row/column is NA, the result - will be first index. + will be NA. 
+ + Raises + ------ + ValueError + * If the row/column is empty Returns ------- diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 118e7d5cd437b..58d86251a4a62 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -49,7 +49,7 @@ from pandas.tseries.frequencies import to_offset from pandas import compat from pandas.compat.numpy import function as nv -from pandas.compat import (map, zip, lzip, lrange, string_types, +from pandas.compat import (map, zip, lzip, lrange, string_types, to_str, isidentifier, set_function_name, cPickle as pkl) from pandas.core.ops import _align_method_FRAME import pandas.core.nanops as nanops @@ -3235,14 +3235,14 @@ def filter(self, items=None, like=None, regex=None, axis=None): **{name: [r for r in items if r in labels]}) elif like: def f(x): - if not isinstance(x, string_types): - x = str(x) - return like in x + return like in to_str(x) values = labels.map(f) return self.loc(axis=axis)[values] elif regex: + def f(x): + return matcher.search(to_str(x)) is not None matcher = re.compile(regex) - values = labels.map(lambda x: matcher.search(str(x)) is not None) + values = labels.map(f) return self.loc(axis=axis)[values] else: raise TypeError('Must pass either `items`, `like`, or `regex`') diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 5c07033f5a68f..5931f6e009dab 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -77,6 +77,119 @@ pandas.Panel.%(name)s """ +_apply_docs = dict( + template=""" + Apply function ``func`` group-wise and combine the results together. + + The function passed to ``apply`` must take a {input} as its first + argument and return a dataframe, a series or a scalar. ``apply`` will + then take care of combining the results back together into a single + dataframe or series. ``apply`` is therefore a highly flexible + grouping method. + + While ``apply`` is a very flexible method, its downside is that + using it can be quite a bit slower than using more specific methods. + Pandas offers a wide range of method that will be much faster + than using ``apply`` for their specific purposes, so try to use them + before reaching for ``apply``. + + Parameters + ---------- + func : function + A callable that takes a {input} as its first argument, and + returns a dataframe, a series or a scalar. In addition the + callable may take positional and keyword arguments + args, kwargs : tuple and dict + Optional positional and keyword arguments to pass to ``func`` + + Returns + ------- + applied : Series or DataFrame + + Notes + ----- + In the current implementation ``apply`` calls func twice on the + first group to decide whether it can take a fast or slow code + path. This can lead to unexpected behavior if func has + side-effects, as they will take effect twice for the first + group. + + Examples + -------- + {examples} + + See also + -------- + pipe : Apply function to the full GroupBy object instead of to each + group. + aggregate, transform + """, + dataframe_examples=""" + >>> df = pd.DataFrame({'A': 'a a b'.split(), 'B': [1,2,3], 'C': [4,6, 5]}) + >>> g = df.groupby('A') + + From ``df`` above we can see that ``g`` has two groups, ``a``, ``b``. + Calling ``apply`` in various ways, we can get different grouping results: + + Example 1: below the function passed to ``apply`` takes a dataframe as + its argument and returns a dataframe. 
``apply`` combines the result for + each group together into a new dataframe: + + >>> g.apply(lambda x: x / x.sum()) + B C + 0 0.333333 0.4 + 1 0.666667 0.6 + 2 1.000000 1.0 + + Example 2: The function passed to ``apply`` takes a dataframe as + its argument and returns a series. ``apply`` combines the result for + each group together into a new dataframe: + + >>> g.apply(lambda x: x.max() - x.min()) + B C + A + a 1 2 + b 0 0 + + Example 3: The function passed to ``apply`` takes a dataframe as + its argument and returns a scalar. ``apply`` combines the result for + each group together into a series, including setting the index as + appropriate: + + >>> g.apply(lambda x: x.C.max() - x.B.min()) + A + a 5 + b 2 + dtype: int64 + """, + series_examples=""" + >>> ser = pd.Series([0, 1, 2], index='a a b'.split()) + >>> g = ser.groupby(ser.index) + + From ``ser`` above we can see that ``g`` has two groups, ``a``, ``b``. + Calling ``apply`` in various ways, we can get different grouping results: + + Example 1: The function passed to ``apply`` takes a series as + its argument and returns a series. ``apply`` combines the result for + each group together into a new series: + + >>> g.apply(lambda x: x*2 if x.name == 'b' else x/2) + 0 0.0 + 1 0.5 + 2 4.0 + dtype: float64 + + Example 2: The function passed to ``apply`` takes a series as + its argument and returns a scalar. ``apply`` combines the result for + each group together into a series, including setting the index as + appropriate: + + >>> g.apply(lambda x: x.max() - x.min()) + a 1 + b 0 + dtype: int64 + """) + _transform_template = """ Call function producing a like-indexed %(klass)s on each group and return a %(klass)s having the same indexes as the original object @@ -144,6 +257,7 @@ """ + # special case to prevent duplicate plots when catching exceptions when # forwarding methods from NDFrames _plotting_methods = frozenset(['plot', 'boxplot', 'hist']) @@ -206,12 +320,13 @@ class Grouper(object): sort : boolean, default to False whether to sort the resulting labels - additional kwargs to control time-like groupers (when freq is passed) + additional kwargs to control time-like groupers (when ``freq`` is passed) - closed : closed end of interval; left or right - label : interval boundary to use for labeling; left or right + closed : closed end of interval; 'left' or 'right' + label : interval boundary to use for labeling; 'left' or 'right' convention : {'start', 'end', 'e', 's'} If grouper is PeriodIndex + base, loffset Returns ------- @@ -233,6 +348,7 @@ class Grouper(object): >>> df.groupby(Grouper(level='date', freq='60s', axis=1)) """ + _attributes = ('key', 'level', 'freq', 'axis', 'sort') def __new__(cls, *args, **kwargs): if kwargs.get('freq') is not None: @@ -333,6 +449,14 @@ def _set_grouper(self, obj, sort=False): def groups(self): return self.grouper.groups + def __repr__(self): + attrs_list = ["{}={!r}".format(attr_name, getattr(self, attr_name)) + for attr_name in self._attributes + if getattr(self, attr_name) is not None] + attrs = ", ".join(attrs_list) + cls_name = self.__class__.__name__ + return "{}({})".format(cls_name, attrs) + class GroupByPlot(PandasObject): """ @@ -653,50 +777,10 @@ def __iter__(self): """ return self.grouper.get_iterator(self.obj, axis=self.axis) - @Substitution(name='groupby') + @Appender(_apply_docs['template'] + .format(input="dataframe", + examples=_apply_docs['dataframe_examples'])) def apply(self, func, *args, **kwargs): - """ - Apply function and combine results together in an intelligent way. 
- - The split-apply-combine combination rules attempt to be as common - sense based as possible. For example: - - case 1: - group DataFrame - apply aggregation function (f(chunk) -> Series) - yield DataFrame, with group axis having group labels - - case 2: - group DataFrame - apply transform function ((f(chunk) -> DataFrame with same indexes) - yield DataFrame with resulting chunks glued together - - case 3: - group Series - apply function with f(chunk) -> DataFrame - yield DataFrame with result of chunks glued together - - Parameters - ---------- - func : function - - Notes - ----- - See online documentation for full exposition on how to use apply. - - In the current implementation apply calls func twice on the - first group to decide whether it can take a fast or slow code - path. This can lead to unexpected behavior if func has - side-effects, as they will take effect twice for the first - group. - - - See also - -------- - pipe : Apply function to the full GroupBy object instead of to each - group. - aggregate, transform - """ func = self._is_builtin_func(func) @@ -2847,9 +2931,11 @@ def is_in_obj(gpr): else: in_axis, name = False, None - if is_categorical_dtype(gpr) and len(gpr) != len(obj): - raise ValueError("Categorical dtype grouper must " - "have len(grouper) == len(data)") + if is_categorical_dtype(gpr) and len(gpr) != obj.shape[axis]: + raise ValueError( + ("Length of grouper ({len_gpr}) and axis ({len_axis})" + " must be same length" + .format(len_gpr=len(gpr), len_axis=obj.shape[axis]))) # create the Grouping # allow us to passing the actual Grouping as the gpr @@ -3011,6 +3097,12 @@ def _selection_name(self): """) + @Appender(_apply_docs['template'] + .format(input='series', + examples=_apply_docs['series_examples'])) + def apply(self, func, *args, **kwargs): + return super(SeriesGroupBy, self).apply(func, *args, **kwargs) + @Appender(_agg_doc) @Appender(_shared_docs['aggregate'] % dict( klass='Series', @@ -4363,7 +4455,8 @@ def count(self): ids, _, ngroups = self.grouper.group_info mask = ids != -1 - val = ((mask & ~isna(blk.get_values())) for blk in data.blocks) + val = ((mask & ~isna(np.atleast_2d(blk.get_values()))) + for blk in data.blocks) loc = (blk.mgr_locs for blk in data.blocks) counter = partial(count_level_2d, labels=ids, max_bin=ngroups, axis=1) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index a995fc10a6674..83c78f084a9da 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1934,7 +1934,10 @@ def putmask(self, mask, value): try: np.putmask(values, mask, self._convert_for_op(value)) return self._shallow_copy(values) - except (ValueError, TypeError): + except (ValueError, TypeError) as err: + if is_object_dtype(self): + raise err + # coerces to object return self.astype(object).putmask(mask, value) @@ -2032,7 +2035,7 @@ def equals(self, other): try: return array_equivalent(_values_from_object(self), _values_from_object(other)) - except: + except Exception: return False def identical(self, other): @@ -2315,7 +2318,7 @@ def intersection(self, other): try: indexer = Index(other._values).get_indexer(self._values) indexer = indexer.take((indexer != -1).nonzero()[0]) - except: + except Exception: # duplicates indexer = algos.unique1d( Index(other._values).get_indexer_non_unique(self._values)[0]) @@ -3024,13 +3027,13 @@ def _reindex_non_unique(self, target): new_indexer = None if len(missing): - l = np.arange(len(indexer)) + length = np.arange(len(indexer)) missing = _ensure_platform_int(missing) missing_labels = 
target.take(missing) - missing_indexer = _ensure_int64(l[~check]) + missing_indexer = _ensure_int64(length[~check]) cur_labels = self.take(indexer[check]).values - cur_indexer = _ensure_int64(l[check]) + cur_indexer = _ensure_int64(length[check]) new_labels = np.empty(tuple([len(indexer)]), dtype=object) new_labels[cur_indexer] = cur_labels diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 8b680127723c3..70b531ffb0ec4 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -79,7 +79,8 @@ def __new__(cls, data=None, categories=None, ordered=None, dtype=None, if data is not None or categories is None: cls._scalar_data_error(data) data = [] - data = cls._create_categorical(cls, data, categories, ordered) + data = cls._create_categorical(cls, data, categories, ordered, + dtype) if copy: data = data.copy() diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 18be6c61abdf7..50085889ad88f 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -2,9 +2,11 @@ from __future__ import division import operator import warnings -from datetime import time, datetime -from datetime import timedelta +from datetime import time, datetime, timedelta + import numpy as np +from pytz import utc + from pandas.core.base import _shared_docs from pandas.core.dtypes.common import ( @@ -55,10 +57,6 @@ from pandas._libs.tslibs import timezones -def _utc(): - import pytz - return pytz.utc - # -------- some conversion wrapper functions @@ -66,7 +64,6 @@ def _field_accessor(name, field, docstring=None): def f(self): values = self.asi8 if self.tz is not None: - utc = _utc() if self.tz is not utc: values = self._local_timestamps() @@ -451,7 +448,7 @@ def _generate(cls, start, end, periods, name, offset, try: inferred_tz = timezones.infer_tzinfo(start, end) - except: + except Exception: raise TypeError('Start and end cannot both be tz-aware with ' 'different timezones') @@ -562,8 +559,6 @@ def _convert_for_op(self, value): raise ValueError('Passed item and index have different timezone') def _local_timestamps(self): - utc = _utc() - if self.is_monotonic: return libts.tz_convert(self.asi8, utc, self.tz) else: @@ -823,7 +818,6 @@ def _add_delta(self, delta): tz = 'UTC' if self.tz is not None else None result = DatetimeIndex(new_values, tz=tz, name=name, freq='infer') - utc = _utc() if self.tz is not None and self.tz is not utc: result = result.tz_convert(self.tz) return result @@ -877,7 +871,6 @@ def astype(self, dtype, copy=True): raise ValueError('Cannot cast DatetimeIndex to dtype %s' % dtype) def _get_time_micros(self): - utc = _utc() values = self.asi8 if self.tz is not None and self.tz is not utc: values = self._local_timestamps() @@ -1183,12 +1176,12 @@ def __iter__(self): # convert in chunks of 10k for efficiency data = self.asi8 - l = len(self) + length = len(self) chunksize = 10000 - chunks = int(l / chunksize) + 1 + chunks = int(length / chunksize) + 1 for i in range(chunks): start_i = i * chunksize - end_i = min((i + 1) * chunksize, l) + end_i = min((i + 1) * chunksize, length) converted = libts.ints_to_pydatetime(data[start_i:end_i], tz=self.tz, freq=self.freq, box=True) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 7bf7cfce515a1..9619f5403b761 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -179,7 +179,7 @@ def __new__(cls, data, closed='right', if isinstance(data, IntervalIndex): left = data.left right = 
data.right - + closed = data.closed else: # don't allow scalars @@ -187,7 +187,7 @@ def __new__(cls, data, closed='right', cls._scalar_data_error(data) data = IntervalIndex.from_intervals(data, name=name) - left, right = data.left, data.right + left, right, closed = data.left, data.right, data.closed return cls._simple_new(left, right, closed, name, copy=copy, verify_integrity=verify_integrity) @@ -569,7 +569,8 @@ def copy(self, deep=False, name=None): left = self.left.copy(deep=True) if deep else self.left right = self.right.copy(deep=True) if deep else self.right name = name if name is not None else self.name - return type(self).from_arrays(left, right, name=name) + closed = self.closed + return type(self).from_arrays(left, right, closed=closed, name=name) @Appender(_index_shared_docs['astype']) def astype(self, dtype, copy=True): diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 4cc59f5297058..f4acb6862addb 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -446,6 +446,17 @@ def _shallow_copy_with_infer(self, values=None, **kwargs): **kwargs) return self._shallow_copy(values, **kwargs) + @Appender(_index_shared_docs['__contains__'] % _index_doc_kwargs) + def __contains__(self, key): + hash(key) + try: + self.get_loc(key) + return True + except (LookupError, TypeError): + return False + + contains = __contains__ + @Appender(_index_shared_docs['_shallow_copy']) def _shallow_copy(self, values=None, **kwargs): if values is not None: @@ -809,9 +820,10 @@ def duplicated(self, keep='first'): return duplicated_int64(ids, keep) - @Appender(ibase._index_shared_docs['fillna']) def fillna(self, value=None, downcast=None): - # isna is not implemented for MultiIndex + """ + fillna is not implemented for MultiIndex + """ raise NotImplementedError('isna is not defined for MultiIndex') @Appender(_index_shared_docs['dropna']) @@ -1370,17 +1382,6 @@ def nlevels(self): def levshape(self): return tuple(len(x) for x in self.levels) - @Appender(_index_shared_docs['__contains__'] % _index_doc_kwargs) - def __contains__(self, key): - hash(key) - try: - self.get_loc(key) - return True - except LookupError: - return False - - contains = __contains__ - def __reduce__(self): """Necessary for making this object picklable""" d = dict(levels=[lev for lev in self.levels], diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 6e08c32f30dcd..445adb6bd3b18 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -36,6 +36,26 @@ join as libjoin, Timedelta, NaT, iNaT) +def _field_accessor(name, alias, docstring=None): + def f(self): + if self.hasnans: + result = np.empty(len(self), dtype='float64') + mask = self._isnan + imask = ~mask + result.flat[imask] = np.array([getattr(Timedelta(val), alias) + for val in self.asi8[imask]]) + result[mask] = np.nan + else: + result = np.array([getattr(Timedelta(val), alias) + for val in self.asi8], dtype='int64') + + return Index(result, name=self.name) + + f.__name__ = name + f.__doc__ = docstring + return property(f) + + def _td_index_cmp(opname, nat_result=False): """ Wrap comparison operations to convert timedelta-like to timedelta64 @@ -380,46 +400,17 @@ def _format_native_types(self, na_rep=u('NaT'), nat_rep=na_rep, justify='all').get_result() - def _get_field(self, m): - - values = self.asi8 - hasnans = self.hasnans - if hasnans: - result = np.empty(len(self), dtype='float64') - mask = self._isnan - imask = ~mask - result.flat[imask] = np.array( - 
[getattr(Timedelta(val), m) for val in values[imask]]) - result[mask] = np.nan - else: - result = np.array([getattr(Timedelta(val), m) - for val in values], dtype='int64') - return Index(result, name=self.name) - - @property - def days(self): - """ Number of days for each element. """ - return self._get_field('days') - - @property - def seconds(self): - """ Number of seconds (>= 0 and less than 1 day) for each element. """ - return self._get_field('seconds') - - @property - def microseconds(self): - """ - Number of microseconds (>= 0 and less than 1 second) for each - element. """ - return self._get_field('microseconds') - - @property - def nanoseconds(self): - """ - Number of nanoseconds (>= 0 and less than 1 microsecond) for each - element. - """ - return self._get_field('nanoseconds') + days = _field_accessor("days", "days", + " Number of days for each element. ") + seconds = _field_accessor("seconds", "seconds", + " Number of seconds (>= 0 and less than 1 day) " + "for each element. ") + microseconds = _field_accessor("microseconds", "microseconds", + "\nNumber of microseconds (>= 0 and less " + "than 1 second) for each\nelement. ") + nanoseconds = _field_accessor("nanoseconds", "nanoseconds", + "\nNumber of nanoseconds (>= 0 and less " + "than 1 microsecond) for each\nelement.\n") @property def components(self): @@ -850,7 +841,7 @@ def insert(self, loc, item): if _is_convertible_to_td(item): try: item = Timedelta(item) - except: + except Exception: pass freq = None diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 045580d393b26..b929dfd5a9d0b 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1946,7 +1946,8 @@ def _can_hold_element(self, element): tipo = maybe_infer_dtype_type(element) if tipo is not None: return issubclass(tipo.type, np.timedelta64) - return isinstance(element, (timedelta, np.timedelta64)) + return is_integer(element) or isinstance( + element, (timedelta, np.timedelta64)) def fillna(self, value, **kwargs): diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index baeb869239c1e..e1c09947ac0b4 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -548,6 +548,9 @@ def nanskew(values, axis=None, skipna=True): m3 = adjusted3.sum(axis, dtype=np.float64) # floating point error + # + # #18044 in _libs/windows.pyx calc_skew follow this behavior + # to fix the fperr to treat m2 <1e-14 as zero m2 = _zero_out_fperr(m2) m3 = _zero_out_fperr(m3) @@ -609,6 +612,9 @@ def nankurt(values, axis=None, skipna=True): result = numer / denom - adj # floating point error + # + # #18044 in _libs/windows.pyx calc_kurt follow this behavior + # to fix the fperr to treat denom <1e-14 as zero numer = _zero_out_fperr(numer) denom = _zero_out_fperr(denom) @@ -699,6 +705,7 @@ def _maybe_null_out(result, axis, mask): def _zero_out_fperr(arg): + # #18044 reference this behavior to fix rolling skew/kurt issue if isinstance(arg, np.ndarray): with np.errstate(invalid='ignore'): return np.where(np.abs(arg) < 1e-14, 0, arg) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 6edbb99641542..1adb3a078cca3 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -395,7 +395,11 @@ def _groupby_and_aggregate(self, how, grouper=None, *args, **kwargs): grouped = PanelGroupBy(obj, grouper=grouper, axis=self.axis) try: - result = grouped.aggregate(how, *args, **kwargs) + if isinstance(obj, ABCDataFrame) and compat.callable(how): + # Check if the function is reducing or not. 
+ result = grouped._aggregate_item_by_item(how, *args, **kwargs) + else: + result = grouped.aggregate(how, *args, **kwargs) except Exception: # we have a non-reducing function @@ -1010,22 +1014,18 @@ class TimeGrouper(Grouper): Parameters ---------- freq : pandas date offset or offset alias for identifying bin edges - closed : closed end of interval; left or right - label : interval boundary to use for labeling; left or right - nperiods : optional, integer + closed : closed end of interval; 'left' or 'right' + label : interval boundary to use for labeling; 'left' or 'right' convention : {'start', 'end', 'e', 's'} If axis is PeriodIndex - - Notes - ----- - Use begin, end, nperiods to generate intervals that cannot be derived - directly from the associated object """ + _attributes = Grouper._attributes + ('closed', 'label', 'how', + 'loffset', 'kind', 'convention', + 'base') def __init__(self, freq='Min', closed=None, label=None, how='mean', - nperiods=None, axis=0, - fill_method=None, limit=None, loffset=None, kind=None, - convention=None, base=0, **kwargs): + axis=0, fill_method=None, limit=None, loffset=None, + kind=None, convention=None, base=0, **kwargs): freq = to_offset(freq) end_types = set(['M', 'A', 'Q', 'BM', 'BA', 'BQ', 'W']) @@ -1044,7 +1044,6 @@ def __init__(self, freq='Min', closed=None, label=None, how='mean', self.closed = closed self.label = label - self.nperiods = nperiods self.kind = kind self.convention = convention or 'E' @@ -1137,6 +1136,16 @@ def _get_time_bins(self, ax): tz=tz, name=ax.name) + # GH 15549 + # In edge case of tz-aware resapmling binner last index can be + # less than the last variable in data object, this happens because of + # DST time change + if len(binner) > 1 and binner[-1] < last: + extra_date_range = pd.date_range(binner[-1], last + self.freq, + freq=self.freq, tz=tz, + name=ax.name) + binner = labels = binner.append(extra_date_range[1:]) + # a little hack trimmed = False if (len(binner) > 2 and binner[-2] == last and diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index e409090e76944..bdb7ec00a29fd 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -126,7 +126,7 @@ def _groupby_and_merge(by, on, left, right, _merge_pieces, try: if k in merged: merged[k] = key - except: + except KeyError: pass pieces.append(merged) @@ -1253,10 +1253,12 @@ def _get_merge_keys(self): join_names) = super(_AsOfMerge, self)._get_merge_keys() # validate index types are the same - for lk, rk in zip(left_join_keys, right_join_keys): + for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)): if not is_dtype_equal(lk.dtype, rk.dtype): - raise MergeError("incompatible merge keys, " - "must be the same type") + raise MergeError("incompatible merge keys [{i}] {lkdtype} and " + "{rkdtype}, must be the same type" + .format(i=i, lkdtype=lk.dtype, + rkdtype=rk.dtype)) # validate tolerance; must be a Timedelta if we have a DTI if self.tolerance is not None: @@ -1266,8 +1268,10 @@ def _get_merge_keys(self): else: lt = left_join_keys[-1] - msg = "incompatible tolerance, must be compat " \ - "with type {lt}".format(lt=type(lt)) + msg = ("incompatible tolerance {tolerance}, must be compat " + "with type {lkdtype}".format( + tolerance=type(self.tolerance), + lkdtype=lt.dtype)) if is_datetime64_dtype(lt) or is_datetime64tz_dtype(lt): if not isinstance(self.tolerance, Timedelta): @@ -1503,12 +1507,12 @@ def _sort_labels(uniques, left, right): # tuplesafe uniques = Index(uniques).values - l = len(left) + llength = len(left) 
labels = np.concatenate([left, right]) _, new_labels = sorting.safe_sort(uniques, labels, na_sentinel=-1) new_labels = _ensure_int64(new_labels) - new_left, new_right = new_labels[:l], new_labels[l:] + new_left, new_right = new_labels[:llength], new_labels[llength:] return new_left, new_right @@ -1525,7 +1529,8 @@ def _get_join_keys(llab, rlab, shape, sort): rkey = stride * rlab[0].astype('i8', subok=False, copy=False) for i in range(1, nlev): - stride //= shape[i] + with np.errstate(divide='ignore'): + stride //= shape[i] lkey += llab[i] * stride rkey += rlab[i] * stride diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index fda339aa30461..2adf17a227a59 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -148,7 +148,7 @@ def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'): Parameters ---------- - x : ndarray or Series + x : 1d ndarray or Series q : integer or array of quantiles Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles diff --git a/pandas/core/series.py b/pandas/core/series.py index 1c92c4b8850ee..2b4f9c4c6f7e3 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -597,7 +597,7 @@ def _ixs(self, i, axis=0): return values[i] except IndexError: raise - except: + except Exception: if isinstance(i, slice): indexer = self.index._convert_slice_indexer(i, kind='iloc') return self._get_values(indexer) @@ -675,7 +675,7 @@ def _get_with(self, key): if isinstance(key, tuple): try: return self._get_values_tuple(key) - except: + except Exception: if len(key) == 1: key = key[0] if isinstance(key, slice): @@ -818,7 +818,7 @@ def _set_with(self, key, value): if not isinstance(key, (list, Series, np.ndarray, Series)): try: key = list(key) - except: + except Exception: key = [key] if isinstance(key, Index): @@ -1306,7 +1306,13 @@ def idxmin(self, axis=None, skipna=True, *args, **kwargs): Parameters ---------- skipna : boolean, default True - Exclude NA/null values + Exclude NA/null values. If the entire Series is NA, the result + will be NA. + + Raises + ------ + ValueError + * If the Series is empty Returns ------- @@ -1336,7 +1342,13 @@ def idxmax(self, axis=None, skipna=True, *args, **kwargs): Parameters ---------- skipna : boolean, default True - Exclude NA/null values + Exclude NA/null values. If the entire Series is NA, the result + will be NA. 
+ + Raises + ------ + ValueError + * If the Series is empty Returns ------- @@ -1731,11 +1743,26 @@ def combine(self, other, func, fill_value=np.nan): ---------- other : Series or scalar value func : function + Function that takes two scalars as inputs and return a scalar fill_value : scalar value Returns ------- result : Series + + Examples + -------- + >>> s1 = Series([1, 2]) + >>> s2 = Series([0, 3]) + >>> s1.combine(s2, lambda x1, x2: x1 if x1 < x2 else x2) + 0 0 + 1 2 + dtype: int64 + + See Also + -------- + Series.combine_first : Combine Series values, choosing the calling + Series's values first """ if isinstance(other, Series): new_index = self.index.union(other.index) @@ -1764,7 +1791,21 @@ def combine_first(self, other): Returns ------- - y : Series + combined : Series + + Examples + -------- + >>> s1 = pd.Series([1, np.nan]) + >>> s2 = pd.Series([3, 4]) + >>> s1.combine_first(s2) + 0 1.0 + 1 4.0 + dtype: float64 + + See Also + -------- + Series.combine : Perform elementwise operation on two Series + using a given function """ new_index = self.index.union(other.index) this = self.reindex(new_index, copy=False) @@ -1982,7 +2023,7 @@ def nlargest(self, n=5, keep='first'): ---------- n : int Return this many descending sorted values - keep : {'first', 'last', False}, default 'first' + keep : {'first', 'last'}, default 'first' Where there are duplicate values: - ``first`` : take the first occurrence. - ``last`` : take the last occurrence. @@ -2029,7 +2070,7 @@ def nsmallest(self, n=5, keep='first'): ---------- n : int Return this many ascending sorted values - keep : {'first', 'last', False}, default 'first' + keep : {'first', 'last'}, default 'first' Where there are duplicate values: - ``first`` : take the first occurrence. - ``last`` : take the last occurrence. diff --git a/pandas/core/strings.py b/pandas/core/strings.py index abef6f6086dbd..9614641aa1abf 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1423,6 +1423,10 @@ def cons_row(x): return [x] result = [cons_row(x) for x in result] + if result: + # propogate nan values to match longest sequence (GH 18450) + max_len = max(len(x) for x in result) + result = [x * max_len if x[0] is np.nan else x for x in result] if not isinstance(expand, bool): raise ValueError("expand must be True or False") diff --git a/pandas/core/window.py b/pandas/core/window.py index 5143dddc5e866..345f9b035a36b 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -503,6 +503,9 @@ class Window(_Window): * ``general_gaussian`` (needs power, width) * ``slepian`` (needs width). + If ``win_type=None`` all points are evenly weighted. To learn more about + different window types see `scipy.signal window functions + `__. 
""" def validate(self): diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index c5d4a0ecf44ab..24eeb1dd94c18 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -45,7 +45,6 @@ import pandas as pd import numpy as np -import itertools import csv from functools import partial @@ -891,6 +890,7 @@ def get_col_type(dtype): name = any(self.frame.index.names) cname = any(self.frame.columns.names) lastcol = self.frame.index.nlevels - 1 + previous_lev3 = None for i, lev in enumerate(self.frame.index.levels): lev2 = lev.format() blank = ' ' * len(lev2[0]) @@ -901,11 +901,19 @@ def get_col_type(dtype): lev3 = [blank] * clevels if name: lev3.append(lev.name) - for level_idx, group in itertools.groupby( - self.frame.index.labels[i]): - count = len(list(group)) - lev3.extend([lev2[level_idx]] + [blank] * (count - 1)) + current_idx_val = None + for level_idx in self.frame.index.labels[i]: + if ((previous_lev3 is None or + previous_lev3[len(lev3)].isspace()) and + lev2[level_idx] == current_idx_val): + # same index as above row and left index was the same + lev3.append(blank) + else: + # different value than above or left index different + lev3.append(lev2[level_idx]) + current_idx_val = lev2[level_idx] strcols.insert(i, lev3) + previous_lev3 = lev3 column_format = self.column_format if column_format is None: @@ -1695,7 +1703,7 @@ def _save_header(self): else: encoded_labels = [] - if not has_mi_columns: + if not has_mi_columns or has_aliases: encoded_labels += list(write_cols) writer.writerow(encoded_labels) else: diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py index b4dc9173f11ba..caa67d1ce6bce 100644 --- a/pandas/io/gbq.py +++ b/pandas/io/gbq.py @@ -29,9 +29,8 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, The main method a user calls to execute a Query in Google BigQuery and read results into a pandas DataFrame. - Google BigQuery API Client Library v2 for Python is used. - Documentation is available `here - `__ + This function requires the `pandas-gbq package + `__. Authentication to the Google BigQuery service is via OAuth 2.0. @@ -70,7 +69,7 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, dialect : {'legacy', 'standard'}, default 'legacy' 'legacy' : Use BigQuery's legacy SQL dialect. - 'standard' : Use BigQuery's standard SQL (beta), which is + 'standard' : Use BigQuery's standard SQL, which is compliant with the SQL 2011 standard. 
For more information see `BigQuery SQL Reference `__ diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py index be39f4baba0fb..203b1d62fcbf3 100644 --- a/pandas/io/json/json.py +++ b/pandas/io/json/json.py @@ -5,7 +5,7 @@ import pandas._libs.json as json from pandas._libs.tslib import iNaT -from pandas.compat import StringIO, long, u +from pandas.compat import StringIO, long, u, to_str from pandas import compat, isna from pandas import Series, DataFrame, to_datetime, MultiIndex from pandas.io.common import (get_filepath_or_buffer, _get_handle, @@ -458,8 +458,10 @@ def read(self): if self.lines and self.chunksize: obj = concat(self) elif self.lines: + + data = to_str(self.data) obj = self._get_object_parser( - self._combine_lines(self.data.split('\n')) + self._combine_lines(data.split('\n')) ) else: obj = self._get_object_parser(self.data) @@ -612,7 +614,7 @@ def _try_convert_data(self, name, data, use_dtypes=True, try: dtype = np.dtype(dtype) return data.astype(dtype), True - except: + except (TypeError, ValueError): return data, False if convert_dates: @@ -628,7 +630,7 @@ def _try_convert_data(self, name, data, use_dtypes=True, try: data = data.astype('float64') result = True - except: + except (TypeError, ValueError): pass if data.dtype.kind == 'f': @@ -639,7 +641,7 @@ def _try_convert_data(self, name, data, use_dtypes=True, try: data = data.astype('float64') result = True - except: + except (TypeError, ValueError): pass # do't coerce 0-len data @@ -651,7 +653,7 @@ def _try_convert_data(self, name, data, use_dtypes=True, if (new_data == data).all(): data = new_data result = True - except: + except (TypeError, ValueError): pass # coerce ints to 64 @@ -661,7 +663,7 @@ def _try_convert_data(self, name, data, use_dtypes=True, try: data = data.astype('int64') result = True - except: + except (TypeError, ValueError): pass return data, result @@ -680,7 +682,7 @@ def _try_convert_to_date(self, data): if new_data.dtype == 'object': try: new_data = data.astype('int64') - except: + except (TypeError, ValueError): pass # ignore numbers that are out of range @@ -697,7 +699,7 @@ def _try_convert_to_date(self, data): unit=date_unit) except ValueError: continue - except: + except Exception: break return new_data, True return data, False diff --git a/pandas/io/json/normalize.py b/pandas/io/json/normalize.py index e811dd1eab142..23d2f730d070c 100644 --- a/pandas/io/json/normalize.py +++ b/pandas/io/json/normalize.py @@ -181,7 +181,7 @@ def _pull_field(js, spec): return result - if isinstance(data, list) and len(data) is 0: + if isinstance(data, list) and not data: return DataFrame() # A bit of a hackjob @@ -207,9 +207,7 @@ def _pull_field(js, spec): elif not isinstance(meta, list): meta = [meta] - for i, x in enumerate(meta): - if not isinstance(x, list): - meta[i] = [x] + meta = [m if isinstance(m, list) else [m] for m in meta] # Disastrously inefficient for now records = [] diff --git a/pandas/io/msgpack/_packer.pyx b/pandas/io/msgpack/_packer.pyx index fd3f4612fb432..f175a6743f44b 100644 --- a/pandas/io/msgpack/_packer.pyx +++ b/pandas/io/msgpack/_packer.pyx @@ -8,6 +8,7 @@ from libc.limits cimport * from pandas.io.msgpack.exceptions import PackValueError from pandas.io.msgpack import ExtType +import numpy as np cdef extern from "../../src/msgpack/pack.h": @@ -133,7 +134,7 @@ cdef class Packer(object): while True: if o is None: ret = msgpack_pack_nil(&self.pk) - elif isinstance(o, bool): + elif isinstance(o, (bool, np.bool_)): if o: ret = msgpack_pack_true(&self.pk) else: diff --git 
a/pandas/io/packers.py b/pandas/io/packers.py index 92270b39f56ef..abd258034af99 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -350,8 +350,11 @@ def unconvert(values, dtype, compress=None): ) # fall through to copying `np.fromstring` - # Copy the string into a numpy array. - return np.fromstring(values, dtype=dtype) + # Copy the bytes into a numpy array. + buf = np.frombuffer(values, dtype=dtype) + buf = buf.copy() # required to not mutate the original data + buf.flags.writeable = True + return buf def encode(obj): diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 4b507b7f5df6f..4a13d2c9db944 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -76,9 +76,10 @@ def write(self, df, path, compression='snappy', table, path, compression=compression, coerce_timestamps=coerce_timestamps, **kwargs) - def read(self, path): + def read(self, path, columns=None, **kwargs): path, _, _ = get_filepath_or_buffer(path) - return self.api.parquet.read_table(path).to_pandas() + return self.api.parquet.read_table(path, columns=columns, + **kwargs).to_pandas() class FastParquetImpl(object): @@ -115,9 +116,9 @@ def write(self, df, path, compression='snappy', **kwargs): self.api.write(path, df, compression=compression, **kwargs) - def read(self, path): + def read(self, path, columns=None, **kwargs): path, _, _ = get_filepath_or_buffer(path) - return self.api.ParquetFile(path).to_pandas() + return self.api.ParquetFile(path).to_pandas(columns=columns, **kwargs) def to_parquet(df, path, engine='auto', compression='snappy', **kwargs): @@ -175,10 +176,10 @@ def to_parquet(df, path, engine='auto', compression='snappy', **kwargs): if df.columns.inferred_type not in valid_types: raise ValueError("parquet must have string column names") - return impl.write(df, path, compression=compression) + return impl.write(df, path, compression=compression, **kwargs) -def read_parquet(path, engine='auto', **kwargs): +def read_parquet(path, engine='auto', columns=None, **kwargs): """ Load a parquet object from the file path, returning a DataFrame. @@ -188,6 +189,10 @@ def read_parquet(path, engine='auto', **kwargs): ---------- path : string File path + columns : list, default=None + If not None, only these columns will be read from the file. + + .. versionadded:: 0.21.1 engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto' Parquet reader library to use. If 'auto', then the option 'io.parquet.engine' is used. If 'auto', then the first @@ -201,4 +206,4 @@ def read_parquet(path, engine='auto', **kwargs): """ impl = get_engine(engine) - return impl.read(path) + return impl.read(path, columns=columns, **kwargs) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 7f3f5630e49f9..df8b1b5cca1d3 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -74,15 +74,19 @@ .. versionadded:: 0.18.1 support for the Python parser. header : int or list of ints, default 'infer' - Row number(s) to use as the column names, and the start of the data. - Default behavior is as if set to 0 if no ``names`` passed, otherwise - ``None``. Explicitly pass ``header=0`` to be able to replace existing - names. The header can be a list of integers that specify row locations for - a multi-index on the columns e.g. [0,1,3]. Intervening rows that are not - specified will be skipped (e.g. 2 in this example is skipped). Note that - this parameter ignores commented lines and empty lines if - ``skip_blank_lines=True``, so header=0 denotes the first line of data - rather than the first line of the file.
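To illustrate the new ``columns`` argument threaded through ``read_parquet`` and both engine ``read`` methods above, a small round-trip sketch; the file name is invented for the example, and a parquet engine (pyarrow or fastparquet) is assumed to be installed:

```python
import pandas as pd

df = pd.DataFrame({'a': [1, 2], 'b': [3.0, 4.0], 'c': ['x', 'y']})
df.to_parquet('example.parquet')  # hypothetical path; either engine works

# only the requested columns are read back, regardless of engine
subset = pd.read_parquet('example.parquet', columns=['a', 'c'])
```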
+ Row number(s) to use as the column names, and the start of the + data. Default behavior is to infer the column names: if no names + are passed the behavior is identical to ``header=0`` and column + names are inferred from the first line of the file, if column + names are passed explicitly then the behavior is identical to + ``header=None``. Explicitly pass ``header=0`` to be able to + replace existing names. The header can be a list of integers that + specify row locations for a multi-index on the columns + e.g. [0,1,3]. Intervening rows that are not specified will be + skipped (e.g. 2 in this example is skipped). Note that this + parameter ignores commented lines and empty lines if + ``skip_blank_lines=True``, so header=0 denotes the first line of + data rather than the first line of the file. names : array-like, default None List of column names to use. If file contains no header row, then you should explicitly pass header=None. Duplicates in this list will cause @@ -1231,6 +1235,8 @@ def __init__(self, kwds): self.na_values = kwds.get('na_values') self.na_fvalues = kwds.get('na_fvalues') + self.na_filter = kwds.get('na_filter', False) + self.true_values = kwds.get('true_values') self.false_values = kwds.get('false_values') self.as_recarray = kwds.get('as_recarray', False) @@ -1404,7 +1410,6 @@ def _make_index(self, data, alldata, columns, indexnamerow=False): elif not self._has_complex_date_col: index = self._get_simple_index(alldata, columns) index = self._agg_index(index) - elif self._has_complex_date_col: if not self._name_processed: (self.index_names, _, @@ -1487,8 +1492,12 @@ def _agg_index(self, index, try_parse_dates=True): if (try_parse_dates and self._should_parse_dates(i)): arr = self._date_conv(arr) - col_na_values = self.na_values - col_na_fvalues = self.na_fvalues + if self.na_filter: + col_na_values = self.na_values + col_na_fvalues = self.na_fvalues + else: + col_na_values = set() + col_na_fvalues = set() if isinstance(self.na_values, dict): col_name = self.index_names[i] @@ -1671,8 +1680,8 @@ def __init__(self, src, **kwds): ParserBase.__init__(self, kwds) - if (kwds.get('compression') is None - and 'utf-16' in (kwds.get('encoding') or '')): + if (kwds.get('compression') is None and + 'utf-16' in (kwds.get('encoding') or '')): # if source is utf-16 plain text, convert source to utf-8 if isinstance(src, compat.string_types): src = open(src, 'rb') @@ -2043,8 +2052,6 @@ def __init__(self, f, **kwds): self.names_passed = kwds['names'] or None - self.na_filter = kwds['na_filter'] - self.has_index_names = False if 'has_index_names' in kwds: self.has_index_names = kwds['has_index_names'] diff --git a/pandas/io/sql.py b/pandas/io/sql.py index c42c19e1357bc..a9b4f504dd624 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -103,12 +103,12 @@ def _handle_date_column(col, utc=None, format=None): if isinstance(format, dict): return to_datetime(col, errors='ignore', **format) else: - if format in ['D', 's', 'ms', 'us', 'ns']: - return to_datetime(col, errors='coerce', unit=format, utc=utc) - elif (issubclass(col.dtype.type, np.floating) or - issubclass(col.dtype.type, np.integer)): - # parse dates as timestamp - format = 's' if format is None else format + # Allow passing of formatting string for integers + # GH17855 + if format is None and (issubclass(col.dtype.type, np.floating) or + issubclass(col.dtype.type, np.integer)): + format = 's' + if format in ['D', 'd', 'h', 'm', 's', 'ms', 'us', 'ns']: return to_datetime(col, errors='coerce', unit=format, utc=utc) elif 
is_datetime64tz_dtype(col): # coerce to UTC timezone diff --git a/pandas/io/stata.py b/pandas/io/stata.py index afc1631a947c8..aafe5f2ce76bd 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -306,11 +306,11 @@ def convert_delta_safe(base, deltas, unit): data_col[bad_locs] = 1.0 # Replace with NaT dates = dates.astype(np.int64) - if fmt in ["%tc", "tc"]: # Delta ms relative to base + if fmt.startswith(("%tc", "tc")): # Delta ms relative to base base = stata_epoch ms = dates conv_dates = convert_delta_safe(base, ms, 'ms') - elif fmt in ["%tC", "tC"]: + elif fmt.startswith(("%tC", "tC")): from warnings import warn warn("Encountered %tC format. Leaving in Stata Internal Format.") @@ -318,27 +318,30 @@ def convert_delta_safe(base, deltas, unit): if has_bad_values: conv_dates[bad_locs] = pd.NaT return conv_dates - elif fmt in ["%td", "td", "%d", "d"]: # Delta days relative to base + # Delta days relative to base + elif fmt.startswith(("%td", "td", "%d", "d")): base = stata_epoch days = dates conv_dates = convert_delta_safe(base, days, 'd') - elif fmt in ["%tw", "tw"]: # does not count leap days - 7 days is a week + # does not count leap days - 7 days is a week. + # 52nd week may have more than 7 days + elif fmt.startswith(("%tw", "tw")): year = stata_epoch.year + dates // 52 days = (dates % 52) * 7 conv_dates = convert_year_days_safe(year, days) - elif fmt in ["%tm", "tm"]: # Delta months relative to base + elif fmt.startswith(("%tm", "tm")): # Delta months relative to base year = stata_epoch.year + dates // 12 month = (dates % 12) + 1 conv_dates = convert_year_month_safe(year, month) - elif fmt in ["%tq", "tq"]: # Delta quarters relative to base + elif fmt.startswith(("%tq", "tq")): # Delta quarters relative to base year = stata_epoch.year + dates // 4 month = (dates % 4) * 3 + 1 conv_dates = convert_year_month_safe(year, month) - elif fmt in ["%th", "th"]: # Delta half-years relative to base + elif fmt.startswith(("%th", "th")): # Delta half-years relative to base year = stata_epoch.year + dates // 2 month = (dates % 2) * 6 + 1 conv_dates = convert_year_month_safe(year, month) - elif fmt in ["%ty", "ty"]: # Years -- not delta + elif fmt.startswith(("%ty", "ty")): # Years -- not delta year = dates month = np.ones_like(dates) conv_dates = convert_year_month_safe(year, month) @@ -1029,10 +1032,6 @@ def _read_header(self): # calculate size of a data record self.col_sizes = lmap(lambda x: self._calcsize(x), self.typlist) - # remove format details from %td - self.fmtlist = ["%td" if x.startswith("%td") else x - for x in self.fmtlist] - def _read_new_header(self, first_char): # The first part of the header is common to 117 and 118. self.path_or_buf.read(27) # stata_dta>
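The ``_handle_date_column`` change above (GH 17855) surfaces through the ``parse_dates`` argument of the SQL readers: an integer column can now be decoded with an explicit unit-style format string such as ``'s'`` or ``'ms'``. A hedged sketch against an in-memory SQLite database; the table and column names are invented for the example:

```python
import sqlite3

import pandas as pd

con = sqlite3.connect(':memory:')
pd.DataFrame({'epoch_s': [1510000000, 1510003600]}).to_sql('events', con, index=False)

# 's' is forwarded to to_datetime(unit='s') for the integer column
df = pd.read_sql_query('SELECT * FROM events', con, parse_dates={'epoch_s': 's'})
```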
@@ -1578,7 +1577,8 @@ def read(self, nrows=None, convert_dates=None, self._do_convert_missing(data, convert_missing) if convert_dates: - cols = np.where(lmap(lambda x: x in _date_formats, + cols = np.where(lmap(lambda x: any(x.startswith(fmt) + for fmt in _date_formats), self.fmtlist))[0] for i in cols: col = data.columns[i] diff --git a/pandas/plotting/__init__.py b/pandas/plotting/__init__.py index 8f98e297e3e66..385d4d7f047c7 100644 --- a/pandas/plotting/__init__.py +++ b/pandas/plotting/__init__.py @@ -11,3 +11,10 @@ from pandas.plotting._core import boxplot from pandas.plotting._style import plot_params from pandas.plotting._tools import table +try: + from pandas.plotting._converter import \ + register as register_matplotlib_converters + from pandas.plotting._converter import \ + deregister as deregister_matplotlib_converters +except ImportError: + pass diff --git a/pandas/plotting/_converter.py b/pandas/plotting/_converter.py index 47d15195315ba..357e84d1f17ea 100644 --- a/pandas/plotting/_converter.py +++ b/pandas/plotting/_converter.py @@ -1,3 +1,4 @@ +import warnings from datetime import datetime, timedelta import datetime as pydt import numpy as np @@ -44,14 +45,96 @@ MUSEC_PER_DAY = 1e6 * SEC_PER_DAY +_WARN = True # Global flag: warn unless the converters were registered explicitly +_mpl_units = {} # Cache for units overwritten by us -def register(): - units.registry[lib.Timestamp] = DatetimeConverter() - units.registry[Period] = PeriodConverter() - units.registry[pydt.datetime] = DatetimeConverter() - units.registry[pydt.date] = DatetimeConverter() - units.registry[pydt.time] = TimeConverter() - units.registry[np.datetime64] = DatetimeConverter() + + +def get_pairs(): + pairs = [ + (lib.Timestamp, DatetimeConverter), + (Period, PeriodConverter), + (pydt.datetime, DatetimeConverter), + (pydt.date, DatetimeConverter), + (pydt.time, TimeConverter), + (np.datetime64, DatetimeConverter), + ] + return pairs + + +def register(explicit=True): + """Register Pandas Formatters and Converters with matplotlib + + This function modifies the global ``matplotlib.units.registry`` + dictionary. Pandas adds custom converters for + + * pd.Timestamp + * pd.Period + * np.datetime64 + * datetime.datetime + * datetime.date + * datetime.time + + See Also + -------- + deregister_matplotlib_converters + """ + # Renamed in pandas.plotting.__init__ + global _WARN + + if explicit: + _WARN = False + + pairs = get_pairs() + for type_, cls in pairs: + converter = cls() + if type_ in units.registry: + previous = units.registry[type_] + _mpl_units[type_] = previous + units.registry[type_] = converter + + +def deregister(): + """Remove pandas' formatters and converters + + Removes the custom converters added by :func:`register`. This + attempts to set the state of the registry back to the state before + pandas registered its own units. Converters for pandas' own types like + Timestamp and Period are removed completely. Converters for types + pandas overwrites, like ``datetime.datetime``, are restored to their + original value. + + See Also + -------- + register_matplotlib_converters + """ + # Renamed in pandas.plotting.__init__ + for type_, cls in get_pairs(): + # We use type to catch our classes directly, no inheritance + if type(units.registry.get(type_)) is cls: + units.registry.pop(type_) + + # restore the old keys + for unit, formatter in _mpl_units.items(): + if type(formatter) not in {DatetimeConverter, PeriodConverter, + TimeConverter}: + # make it idempotent by excluding ours.
+ units.registry[unit] = formatter + + +def _check_implicitly_registered(): + global _WARN + + if _WARN: + msg = ("Using an implicitly registered datetime converter for a " + "matplotlib plotting method. The converter was registered " + "by pandas on import. Future versions of pandas will require " + "you to explicitly register matplotlib converters.\n\n" + "To register the converters:\n\t" + ">>> from pandas.plotting import register_matplotlib_converters" + "\n\t" + ">>> register_matplotlib_converters()") + warnings.warn(msg, FutureWarning) + _WARN = False def _to_ordinalf(tm): @@ -189,6 +272,7 @@ class DatetimeConverter(dates.DateConverter): @staticmethod def convert(values, unit, axis): # values might be a 1-d array, or a list-like of arrays. + _check_implicitly_registered() if is_nested_list_like(values): values = [DatetimeConverter._convert_1d(v, unit, axis) for v in values] @@ -273,6 +357,7 @@ class PandasAutoDateLocator(dates.AutoDateLocator): def get_locator(self, dmin, dmax): 'Pick the best locator based on a distance.' + _check_implicitly_registered() delta = relativedelta(dmax, dmin) num_days = (delta.years * 12.0 + delta.months) * 31.0 + delta.days @@ -314,6 +399,7 @@ def get_unit_generic(freq): def __call__(self): # if no data have been set, this will tank with a ValueError + _check_implicitly_registered() try: dmin, dmax = self.viewlim_to_dt() except ValueError: @@ -914,6 +1000,8 @@ def _get_default_locs(self, vmin, vmax): def __call__(self): 'Return the locations of the ticks.' # axis calls Locator.set_axis inside set_m_formatter + _check_implicitly_registered() + vi = tuple(self.axis.get_view_interval()) if vi != self.plot_obj.view_interval: self.plot_obj.date_axis_info = None @@ -998,6 +1086,8 @@ def set_locs(self, locs): 'Sets the locations of the ticks' # don't actually use the locs. This is just needed to work with # matplotlib. 
Force to use vmin, vmax + _check_implicitly_registered() + self.locs = locs (vmin, vmax) = vi = tuple(self.axis.get_view_interval()) @@ -1009,6 +1099,8 @@ def set_locs(self, locs): self._set_default_format(vmin, vmax) def __call__(self, x, pos=0): + _check_implicitly_registered() + if self.formatdict is None: return '' else: @@ -1039,6 +1131,7 @@ def format_timedelta_ticks(x, pos, n_decimals): return s def __call__(self, x, pos=0): + _check_implicitly_registered() (vmin, vmax) = tuple(self.axis.get_view_interval()) n_decimals = int(np.ceil(np.log10(100 * 1e9 / (vmax - vmin)))) if n_decimals > 9: diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 43f33cf30dea1..e1380953e4519 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -11,6 +11,7 @@ from pandas.util._decorators import cache_readonly from pandas.core.base import PandasObject +from pandas.core.config import get_option from pandas.core.dtypes.missing import isna, notna, remove_na_arraylike from pandas.core.dtypes.common import ( is_list_like, @@ -40,16 +41,13 @@ _get_xlim, _set_ticks_props, format_date_labels) -_registered = False - - -def _setup(): - # delay the import of matplotlib until nescessary - global _registered - if not _registered: - from pandas.plotting import _converter - _converter.register() - _registered = True +try: + from pandas.plotting import _converter +except ImportError: + pass +else: + if get_option('plotting.matplotlib.register_converters'): + _converter.register(explicit=True) def _get_standard_kind(kind): @@ -99,7 +97,7 @@ def __init__(self, data, kind=None, by=None, subplots=False, sharex=None, secondary_y=False, colormap=None, table=False, layout=None, **kwds): - _setup() + _converter._WARN = False self.data = data self.by = by @@ -383,12 +381,16 @@ def _add_table(self): def _post_plot_logic_common(self, ax, data): """Common post process for each axes""" - labels = [pprint_thing(key) for key in data.index] - labels = dict(zip(range(len(data.index)), labels)) + + def get_label(i): + try: + return pprint_thing(data.index[i]) + except Exception: + return '' if self.orientation == 'vertical' or self.orientation is None: if self._need_to_set_index: - xticklabels = [labels.get(x, '') for x in ax.get_xticks()] + xticklabels = [get_label(x) for x in ax.get_xticks()] ax.set_xticklabels(xticklabels) self._apply_axis_properties(ax.xaxis, rot=self.rot, fontsize=self.fontsize) @@ -400,7 +402,7 @@ def _post_plot_logic_common(self, ax, data): elif self.orientation == 'horizontal': if self._need_to_set_index: - yticklabels = [labels.get(y, '') for y in ax.get_yticks()] + yticklabels = [get_label(y) for y in ax.get_yticks()] ax.set_yticklabels(yticklabels) self._apply_axis_properties(ax.yaxis, rot=self.rot, fontsize=self.fontsize) @@ -2059,7 +2061,7 @@ def boxplot_frame(self, column=None, by=None, ax=None, fontsize=None, rot=0, grid=True, figsize=None, layout=None, return_type=None, **kwds): import matplotlib.pyplot as plt - _setup() + _converter._WARN = False ax = boxplot(self, column=column, by=by, ax=ax, fontsize=fontsize, grid=grid, rot=rot, figsize=figsize, layout=layout, return_type=return_type, **kwds) @@ -2155,7 +2157,7 @@ def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None, kwds : other plotting keyword arguments To be passed to hist function """ - _setup() + _converter._WARN = False if by is not None: axes = grouped_hist(data, column=column, by=by, ax=ax, grid=grid, figsize=figsize, sharex=sharex, sharey=sharey, @@ -2289,6 +2291,8 @@ def grouped_hist(data, 
column=None, by=None, ax=None, bins=50, figsize=None, ------- axes: collection of Matplotlib Axes """ + _converter._WARN = False + def plot_group(group, ax): ax.hist(group.dropna().values, bins=bins, **kwargs) @@ -2352,7 +2356,7 @@ def boxplot_frame_groupby(grouped, subplots=True, column=None, fontsize=None, >>> grouped = df.unstack(level='lvl1').groupby(level=0, axis=1) >>> boxplot_frame_groupby(grouped, subplots=False) """ - _setup() + _converter._WARN = False if subplots is True: naxes = len(grouped) fig, axes = _subplots(naxes=naxes, squeeze=False, diff --git a/pandas/plotting/_timeseries.py b/pandas/plotting/_timeseries.py index 3d04973ed0009..56b5311326e98 100644 --- a/pandas/plotting/_timeseries.py +++ b/pandas/plotting/_timeseries.py @@ -1,5 +1,7 @@ # TODO: Use the fact that axis can have units to simplify the process +import functools + import numpy as np from matplotlib import pylab @@ -293,6 +295,10 @@ def format_timedelta_ticks(x, pos, n_decimals): return s +def _format_coord(freq, t, y): + return "t = {0} y = {1:8f}".format(Period(ordinal=int(t), freq=freq), y) + + def format_dateaxis(subplot, freq, index): """ Pretty-formats the date axis (x-axis). @@ -327,8 +333,7 @@ def format_dateaxis(subplot, freq, index): subplot.xaxis.set_minor_formatter(minformatter) # x and y coord info - subplot.format_coord = lambda t, y: ( - "t = {0} y = {1:8f}".format(Period(ordinal=int(t), freq=freq), y)) + subplot.format_coord = functools.partial(_format_coord, freq) elif isinstance(index, TimedeltaIndex): subplot.xaxis.set_major_formatter( diff --git a/pandas/tests/dtypes/test_cast.py b/pandas/tests/dtypes/test_cast.py index d9fb458c83529..82a35fa711e8c 100644 --- a/pandas/tests/dtypes/test_cast.py +++ b/pandas/tests/dtypes/test_cast.py @@ -38,17 +38,17 @@ def test_downcast_conv(self): arr = np.array([8.5, 8.6, 8.7, 8.8, 8.9999999999995]) result = maybe_downcast_to_dtype(arr, 'infer') - assert (np.array_equal(result, arr)) + tm.assert_numpy_array_equal(result, arr) arr = np.array([8., 8., 8., 8., 8.9999999999995]) result = maybe_downcast_to_dtype(arr, 'infer') - expected = np.array([8, 8, 8, 8, 9]) - assert (np.array_equal(result, expected)) + expected = np.array([8, 8, 8, 8, 9], dtype=np.int64) + tm.assert_numpy_array_equal(result, expected) arr = np.array([8., 8., 8., 8., 9.0000000000005]) result = maybe_downcast_to_dtype(arr, 'infer') - expected = np.array([8, 8, 8, 8, 9]) - assert (np.array_equal(result, expected)) + expected = np.array([8, 8, 8, 8, 9], dtype=np.int64) + tm.assert_numpy_array_equal(result, expected) # GH16875 coercing of bools ser = Series([True, True, False]) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 70273f9e999cf..7195cb43a70dc 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -416,6 +416,12 @@ def test_length_zero(self): result = lib.infer_dtype([]) assert result == 'empty' + # GH 18004 + arr = np.array([np.array([], dtype=object), + np.array([], dtype=object)]) + result = lib.infer_dtype(arr) + assert result == 'empty' + def test_integers(self): arr = np.array([1, 2, 3, np.int64(4), np.int32(5)], dtype='O') result = lib.infer_dtype(arr) diff --git a/pandas/tests/frame/test_axis_select_reindex.py b/pandas/tests/frame/test_axis_select_reindex.py index 1e2f630401c89..343e235fb741c 100644 --- a/pandas/tests/frame/test_axis_select_reindex.py +++ b/pandas/tests/frame/test_axis_select_reindex.py @@ -884,6 +884,27 @@ def test_filter_regex_search(self): exp = df[[x for x in 
df.columns if 'BB' in x]] assert_frame_equal(result, exp) + @pytest.mark.parametrize('name,expected', [ + ('a', DataFrame({u'a': [1, 2]})), + (u'a', DataFrame({u'a': [1, 2]})), + (u'あ', DataFrame({u'あ': [3, 4]})) + ]) + def test_filter_unicode(self, name, expected): + # GH13101 + df = DataFrame({u'a': [1, 2], u'あ': [3, 4]}) + + assert_frame_equal(df.filter(like=name), expected) + assert_frame_equal(df.filter(regex=name), expected) + + @pytest.mark.parametrize('name', ['a', u'a']) + def test_filter_bytestring(self, name): + # GH13101 + df = DataFrame({b'a': [1, 2], b'b': [3, 4]}) + expected = DataFrame({b'a': [1, 2]}) + + assert_frame_equal(df.filter(like=name), expected) + assert_frame_equal(df.filter(regex=name), expected) + def test_filter_corner(self): empty = DataFrame() diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index c55c79ef18602..8291e9d452348 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1913,10 +1913,11 @@ def test_from_records_len0_with_columns(self): # #2633 result = DataFrame.from_records([], index='foo', columns=['foo', 'bar']) + expected = Index(['bar']) - assert np.array_equal(result.columns, ['bar']) assert len(result) == 0 assert result.index.name == 'foo' + tm.assert_index_equal(result.columns, expected) def test_to_frame_with_falsey_names(self): # GH 16114 diff --git a/pandas/tests/frame/test_convert_to.py b/pandas/tests/frame/test_convert_to.py index 5bdb76494f4c8..7d2d18db8d41c 100644 --- a/pandas/tests/frame/test_convert_to.py +++ b/pandas/tests/frame/test_convert_to.py @@ -1,6 +1,9 @@ # -*- coding: utf-8 -*- +from datetime import datetime + import pytest +import pytz import collections import numpy as np @@ -249,3 +252,18 @@ def test_to_dict_box_scalars(self): result = DataFrame(d).to_dict(orient='records') assert isinstance(result[0]['a'], (int, long)) + + def test_frame_to_dict_tz(self): + # GH18372 When converting to dict with orient='records' columns of + # datetime that are tz-aware were not converted to required arrays + data = [(datetime(2017, 11, 18, 21, 53, 0, 219225, tzinfo=pytz.utc),), + (datetime(2017, 11, 18, 22, 6, 30, 61810, tzinfo=pytz.utc,),)] + df = DataFrame(list(data), columns=["d", ]) + + result = df.to_dict(orient='records') + expected = [ + {'d': Timestamp('2017-11-18 21:53:00.219225+0000', tz=pytz.utc)}, + {'d': Timestamp('2017-11-18 22:06:30.061810+0000', tz=pytz.utc)}, + ] + tm.assert_dict_equal(result[0], expected[0]) + tm.assert_dict_equal(result[1], expected[1]) diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index abb528f0d2179..5adcd3b6855ce 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -10,6 +10,8 @@ from pandas import (DataFrame, Series, date_range, Timedelta, Timestamp, compat, concat, option_context) from pandas.compat import u +from pandas import _np_version_under1p14 + from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.tests.frame.common import TestData from pandas.util.testing import (assert_series_equal, @@ -531,7 +533,12 @@ def test_astype_str(self): assert_frame_equal(result, expected) result = DataFrame([1.12345678901234567890]).astype(tt) - expected = DataFrame(['1.12345678901']) + if _np_version_under1p14: + # < 1.14 truncates + expected = DataFrame(['1.12345678901']) + else: + # >= 1.14 preserves the full repr + expected = DataFrame(['1.1234567890123457']) assert_frame_equal(result, expected) 
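The branching in ``test_astype_str`` above reflects a NumPy behaviour change rather than a pandas one; a quick way to check which result to expect locally (the output depends on the installed NumPy version):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame([1.12345678901234567890])

# NumPy >= 1.14 keeps the full float repr; older versions truncate to ~11 digits
print(np.__version__, df.astype(str).iloc[0, 0])
```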
@pytest.mark.parametrize("dtype_class", [dict, Series]) diff --git a/pandas/tests/frame/test_nonunique_indexes.py b/pandas/tests/frame/test_nonunique_indexes.py index 4f77ba0ae1f5a..5b903c5a1eaf6 100644 --- a/pandas/tests/frame/test_nonunique_indexes.py +++ b/pandas/tests/frame/test_nonunique_indexes.py @@ -448,7 +448,7 @@ def test_as_matrix_duplicates(self): expected = np.array([[1, 2, 'a', 'b'], [1, 2, 'a', 'b']], dtype=object) - assert np.array_equal(result, expected) + tm.assert_numpy_array_equal(result, expected) def test_set_value_by_index(self): # See gh-12344 diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index 4162a586f8063..ca8a0d8bda3ab 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -1203,3 +1203,16 @@ def test_period_index_date_overflow(self): expected = ',0\n1990-01-01,4\n,5\n3005-01-01,6\n' assert result == expected + + def test_multi_index_header(self): + # see gh-5539 + columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), + ("b", 1), ("b", 2)]) + df = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]]) + df.columns = columns + + header = ["a", "b", "c", "d"] + result = df.to_csv(header=header) + + expected = ",a,b,c,d\n0,1,2,3,4\n1,5,6,7,8\n" + assert result == expected diff --git a/pandas/tests/groupby/test_counting.py b/pandas/tests/groupby/test_counting.py index 485241d593d4f..787d99086873e 100644 --- a/pandas/tests/groupby/test_counting.py +++ b/pandas/tests/groupby/test_counting.py @@ -2,9 +2,11 @@ from __future__ import print_function import numpy as np +import pytest -from pandas import (DataFrame, Series, MultiIndex) -from pandas.util.testing import assert_series_equal +from pandas import (DataFrame, Series, MultiIndex, Timestamp, Timedelta, + Period) +from pandas.util.testing import (assert_series_equal, assert_frame_equal) from pandas.compat import (range, product as cart_product) @@ -195,3 +197,18 @@ def test_ngroup_respects_groupby_order(self): g.ngroup()) assert_series_equal(Series(df['group_index'].values), g.cumcount()) + + @pytest.mark.parametrize('datetimelike', [ + [Timestamp('2016-05-%02d 20:09:25+00:00' % i) for i in range(1, 4)], + [Timestamp('2016-05-%02d 20:09:25' % i) for i in range(1, 4)], + [Timedelta(x, unit="h") for x in range(1, 4)], + [Period(freq="2W", year=2017, month=x) for x in range(1, 4)]]) + def test_count_with_datetimelike(self, datetimelike): + # test for #13393, where DataframeGroupBy.count() fails + # when counting a datetimelike column. 
+ + df = DataFrame({'x': ['a', 'a', 'b'], 'y': datetimelike}) + res = df.groupby('x').count() + expected = DataFrame({'y': [2, 1]}, index=['a', 'b']) + expected.index.name = "x" + assert_frame_equal(expected, res) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 9d25117fbd954..675f8d6413b2a 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -10,7 +10,7 @@ from pandas import (date_range, bdate_range, Timestamp, Index, MultiIndex, DataFrame, Series, - concat, Panel, DatetimeIndex) + concat, Panel, DatetimeIndex, CategoricalIndex) from pandas.errors import UnsupportedFunctionCall, PerformanceWarning from pandas.util.testing import (assert_panel_equal, assert_frame_equal, assert_series_equal, assert_almost_equal, @@ -28,6 +28,15 @@ from .common import MixIn +class TestGrouper(object): + + def test_repr(self): + # GH18203 + result = repr(pd.Grouper(key='A', level='B')) + expected = "Grouper(key='A', level='B', axis=0, sort=False)" + assert result == expected + + class TestGroupBy(MixIn): def test_basic(self): @@ -253,6 +262,29 @@ def test_grouper_column_and_index(self): expected = df_single.reset_index().groupby(['inner', 'B']).mean() assert_frame_equal(result, expected) + def test_groupby_categorical_index_and_columns(self): + # GH18432 + columns = ['A', 'B', 'A', 'B'] + categories = ['B', 'A'] + data = np.ones((5, 4), int) + cat_columns = CategoricalIndex(columns, + categories=categories, + ordered=True) + df = DataFrame(data=data, columns=cat_columns) + result = df.groupby(axis=1, level=0).sum() + expected_data = 2 * np.ones((5, 2), int) + expected_columns = CategoricalIndex(categories, + categories=categories, + ordered=True) + expected = DataFrame(data=expected_data, columns=expected_columns) + assert_frame_equal(result, expected) + + # test transposed version + df = DataFrame(data.T, index=cat_columns) + result = df.groupby(axis=0, level=0).sum() + expected = DataFrame(data=expected_data.T, index=expected_columns) + assert_frame_equal(result, expected) + def test_grouper_getting_correct_binner(self): # GH 10063 diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 456e5a9bd6439..3a57337efea6f 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -996,3 +996,16 @@ def test_searchsorted_monotonic(self, indices): # non-monotonic should raise. 
with pytest.raises(ValueError): indices._searchsorted_monotonic(value, side='left') + + def test_putmask_with_wrong_mask(self): + # GH18368 + index = self.create_index() + + with pytest.raises(ValueError): + index.putmask(np.ones(len(index) + 1, np.bool), 1) + + with pytest.raises(ValueError): + index.putmask(np.ones(len(index) - 1, np.bool), 1) + + with pytest.raises(ValueError): + index.putmask('foo', 1) diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index 3b40ef092f364..1349f2f761a2f 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -20,11 +20,6 @@ START, END = datetime(2009, 1, 1), datetime(2010, 1, 1) -def eq_gen_range(kwargs, expected): - rng = generate_range(**kwargs) - assert (np.array_equal(list(rng), expected)) - - class TestDateRanges(TestData): def test_date_range_gen_error(self): @@ -201,20 +196,23 @@ def test_generate_cday(self): assert rng1 == rng2 def test_1(self): - eq_gen_range(dict(start=datetime(2009, 3, 25), periods=2), - [datetime(2009, 3, 25), datetime(2009, 3, 26)]) + rng = list(generate_range(start=datetime(2009, 3, 25), periods=2)) + expected = [datetime(2009, 3, 25), datetime(2009, 3, 26)] + assert rng == expected def test_2(self): - eq_gen_range(dict(start=datetime(2008, 1, 1), - end=datetime(2008, 1, 3)), - [datetime(2008, 1, 1), - datetime(2008, 1, 2), - datetime(2008, 1, 3)]) + rng = list(generate_range(start=datetime(2008, 1, 1), + end=datetime(2008, 1, 3))) + expected = [datetime(2008, 1, 1), + datetime(2008, 1, 2), + datetime(2008, 1, 3)] + assert rng == expected def test_3(self): - eq_gen_range(dict(start=datetime(2008, 1, 5), - end=datetime(2008, 1, 6)), - []) + rng = list(generate_range(start=datetime(2008, 1, 5), + end=datetime(2008, 1, 6))) + expected = [] + assert rng == expected def test_precision_finer_than_offset(self): # GH 9907 @@ -236,6 +234,22 @@ def test_precision_finer_than_offset(self): tm.assert_index_equal(result1, expected1) tm.assert_index_equal(result2, expected2) + dt1, dt2 = '2017-01-01', '2017-01-01' + tz1, tz2 = 'US/Eastern', 'Europe/London' + + @pytest.mark.parametrize("start,end", [ + (pd.Timestamp(dt1, tz=tz1), pd.Timestamp(dt2)), + (pd.Timestamp(dt1), pd.Timestamp(dt2, tz=tz2)), + (pd.Timestamp(dt1, tz=tz1), pd.Timestamp(dt2, tz=tz2)), + (pd.Timestamp(dt1, tz=tz2), pd.Timestamp(dt2, tz=tz1)) + ]) + def test_mismatching_tz_raises_err(self, start, end): + # issue 18488 + with pytest.raises(TypeError): + pd.date_range(start, end) + with pytest.raises(TypeError): + pd.DatetimeIndex(start, end, freq=BDay()) + class TestBusinessDateRange(object): diff --git a/pandas/tests/indexes/datetimes/test_partial_slicing.py b/pandas/tests/indexes/datetimes/test_partial_slicing.py index e7d03aa193cbd..04c180350fb72 100644 --- a/pandas/tests/indexes/datetimes/test_partial_slicing.py +++ b/pandas/tests/indexes/datetimes/test_partial_slicing.py @@ -2,9 +2,10 @@ import pytest -from datetime import datetime +from datetime import datetime, date import numpy as np import pandas as pd +import operator as op from pandas import (DatetimeIndex, Series, DataFrame, date_range, Index, Timedelta, Timestamp) @@ -268,3 +269,21 @@ def test_loc_datetime_length_one(self): result = df.loc['2016-10-01T00:00:00':] tm.assert_frame_equal(result, df) + + @pytest.mark.parametrize('datetimelike', [ + Timestamp('20130101'), datetime(2013, 1, 1), + date(2013, 1, 1), np.datetime64('2013-01-01T00:00', 'ns')]) + 
@pytest.mark.parametrize('op,expected', [ + (op.lt, [True, False, False, False]), + (op.le, [True, True, False, False]), + (op.eq, [False, True, False, False]), + (op.gt, [False, False, False, True])]) + def test_selection_by_datetimelike(self, datetimelike, op, expected): + # GH issue #17965, test for ability to compare datetime64[ns] columns + # to datetimelike + df = DataFrame({'A': [pd.Timestamp('20120101'), + pd.Timestamp('20130101'), + np.nan, pd.Timestamp('20130103')]}) + result = op(df.A, datetimelike) + expected = Series(expected, name='A') + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 330ec9f357655..c7944c078d8c4 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -960,6 +960,7 @@ def test_guess_datetime_format_nopadding(self): for dt_string, dt_format in dt_string_to_format: assert tools._guess_datetime_format(dt_string) == dt_format + @pytest.mark.xfail(reason="GH18141 - dateutil > 2.6.1 broken") def test_guess_datetime_format_for_array(self): tm._skip_if_not_us_locale() expected_format = '%Y-%m-%d %H:%M:%S.%f' diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index d8ec23b9c7e0e..5e40e06d57413 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -4,6 +4,7 @@ import pandas.util.testing as tm from pandas.core.indexes.api import Index, CategoricalIndex +from pandas.core.dtypes.dtypes import CategoricalDtype from .common import Base from pandas.compat import range, PY3 @@ -95,6 +96,11 @@ def test_construction(self): 1, -1, 0], dtype='int8')) assert result.ordered + result = pd.CategoricalIndex(ci, categories=list('ab'), ordered=True) + expected = pd.CategoricalIndex(ci, categories=list('ab'), ordered=True, + dtype='category') + tm.assert_index_equal(result, expected, exact=True) + # turn me to an Index result = Index(np.array(ci)) assert isinstance(result, Index) @@ -125,6 +131,25 @@ def test_construction_with_dtype(self): result = CategoricalIndex(idx, categories=idx, ordered=True) tm.assert_index_equal(result, expected, exact=True) + def test_construction_with_categorical_dtype(self): + # construction with CategoricalDtype + # GH18109 + data, cats, ordered = 'a a b b'.split(), 'c b a'.split(), True + dtype = CategoricalDtype(categories=cats, ordered=ordered) + + result = pd.CategoricalIndex(data, dtype=dtype) + expected = pd.CategoricalIndex(data, categories=cats, + ordered=ordered) + tm.assert_index_equal(result, expected, exact=True) + + # error to combine categories or ordered and dtype keywords args + with pytest.raises(ValueError, match="Cannot specify both `dtype` and " + "`categories` or `ordered`."): + pd.CategoricalIndex(data, categories=cats, dtype=dtype) + with pytest.raises(ValueError, match="Cannot specify both `dtype` and " + "`categories` or `ordered`."): + pd.CategoricalIndex(data, ordered=ordered, dtype=dtype) + def test_create_categorical(self): # https://github.com/pandas-dev/pandas/pull/17513 # The public CI constructor doesn't hit this code path with diff --git a/pandas/tests/indexes/test_interval.py b/pandas/tests/indexes/test_interval.py index b55bab3a210cc..399d88309072e 100644 --- a/pandas/tests/indexes/test_interval.py +++ b/pandas/tests/indexes/test_interval.py @@ -6,6 +6,7 @@ from pandas import (Interval, IntervalIndex, Index, isna, interval_range, Timestamp, Timedelta, compat, date_range, 
timedelta_range, DateOffset) +from pandas.compat import zip from pandas.tseries.offsets import Day from pandas._libs.interval import IntervalTree from pandas.tests.indexes.common import Base @@ -13,6 +14,11 @@ import pandas as pd +@pytest.fixture(scope='class', params=['left', 'right', 'both', 'neither']) +def closed(request): + return request.param + + class TestIntervalIndex(Base): _holder = IntervalIndex @@ -22,34 +28,63 @@ def setup_method(self, method): [(0, 1), np.nan, (1, 2)]) self.indices = dict(intervalIndex=tm.makeIntervalIndex(10)) - def create_index(self): - return IntervalIndex.from_breaks(np.arange(10)) + def create_index(self, closed='right'): + return IntervalIndex.from_breaks(np.arange(3), closed=closed) - def test_constructors(self): - expected = self.index - actual = IntervalIndex.from_breaks(np.arange(3), closed='right') - assert expected.equals(actual) + def create_index_with_nan(self, closed='right'): + return IntervalIndex.from_tuples( + [(0, 1), np.nan, (1, 2)], closed=closed) - alternate = IntervalIndex.from_breaks(np.arange(3), closed='left') - assert not expected.equals(alternate) + @pytest.mark.parametrize('name', [None, 'foo']) + def test_constructors(self, closed, name): + left, right = Index([0, 1, 2, 3]), Index([1, 2, 3, 4]) + ivs = [Interval(l, r, closed=closed) for l, r in zip(left, right)] + expected = IntervalIndex._simple_new( + left=left, right=right, closed=closed, name=name) - actual = IntervalIndex.from_intervals([Interval(0, 1), Interval(1, 2)]) - assert expected.equals(actual) + result = IntervalIndex(ivs, name=name) + tm.assert_index_equal(result, expected) - actual = IntervalIndex([Interval(0, 1), Interval(1, 2)]) - assert expected.equals(actual) + result = IntervalIndex.from_intervals(ivs, name=name) + tm.assert_index_equal(result, expected) - actual = IntervalIndex.from_arrays(np.arange(2), np.arange(2) + 1, - closed='right') - assert expected.equals(actual) + result = IntervalIndex.from_breaks( + np.arange(5), closed=closed, name=name) + tm.assert_index_equal(result, expected) - actual = Index([Interval(0, 1), Interval(1, 2)]) - assert isinstance(actual, IntervalIndex) - assert expected.equals(actual) + result = IntervalIndex.from_arrays( + left.values, right.values, closed=closed, name=name) + tm.assert_index_equal(result, expected) - actual = Index(expected) - assert isinstance(actual, IntervalIndex) - assert expected.equals(actual) + result = IntervalIndex.from_tuples( + zip(left, right), closed=closed, name=name) + tm.assert_index_equal(result, expected) + + result = Index(ivs, name=name) + assert isinstance(result, IntervalIndex) + tm.assert_index_equal(result, expected) + + # idempotent + tm.assert_index_equal(Index(expected), expected) + tm.assert_index_equal(IntervalIndex(expected), expected) + + result = IntervalIndex.from_intervals( + expected.values, name=expected.name) + tm.assert_index_equal(result, expected) + + left, right = expected.left, expected.right + result = IntervalIndex.from_arrays( + left, right, closed=expected.closed, name=expected.name) + tm.assert_index_equal(result, expected) + + result = IntervalIndex.from_tuples( + expected.to_tuples(), closed=expected.closed, name=expected.name) + tm.assert_index_equal(result, expected) + + breaks = expected.left.tolist() + [expected.right[-1]] + result = IntervalIndex.from_breaks( + breaks, closed=expected.closed, name=expected.name) + tm.assert_index_equal(result, expected) def test_constructors_other(self): @@ -66,43 +101,57 @@ def test_constructors_other(self): def 
test_constructors_errors(self): # scalar - with pytest.raises(TypeError): + msg = ('IntervalIndex(...) must be called with a collection of ' + 'some kind, 5 was passed') + with pytest.raises(TypeError, message=msg): IntervalIndex(5) # not an interval - with pytest.raises(TypeError): + msg = "type with value 0 is not an interval" + with pytest.raises(TypeError, message=msg): IntervalIndex([0, 1]) - with pytest.raises(TypeError): + with pytest.raises(TypeError, message=msg): IntervalIndex.from_intervals([0, 1]) # invalid closed - with pytest.raises(ValueError): + msg = "invalid options for 'closed': invalid" + with pytest.raises(ValueError, message=msg): IntervalIndex.from_arrays([0, 1], [1, 2], closed='invalid') # mismatched closed - with pytest.raises(ValueError): + msg = 'intervals must all be closed on the same side' + with pytest.raises(ValueError, message=msg): IntervalIndex.from_intervals([Interval(0, 1), Interval(1, 2, closed='left')]) - with pytest.raises(ValueError): + with pytest.raises(ValueError, message=msg): IntervalIndex.from_arrays([0, 10], [3, 5]) - with pytest.raises(ValueError): + with pytest.raises(ValueError, message=msg): Index([Interval(0, 1), Interval(2, 3, closed='left')]) # no point in nesting periods in an IntervalIndex - with pytest.raises(ValueError): + msg = 'Period dtypes are not supported, use a PeriodIndex instead' + with pytest.raises(ValueError, message=msg): IntervalIndex.from_breaks( pd.period_range('2000-01-01', periods=3)) - def test_constructors_datetimelike(self): + # decreasing breaks/arrays + msg = 'left side of interval must be <= right side' + with pytest.raises(ValueError, message=msg): + IntervalIndex.from_breaks(range(10, -1, -1)) + + with pytest.raises(ValueError, message=msg): + IntervalIndex.from_arrays(range(10, -1, -1), range(9, -2, -1)) + + def test_constructors_datetimelike(self, closed): # DTI / TDI for idx in [pd.date_range('20130101', periods=5), pd.timedelta_range('1 day', periods=5)]: - result = IntervalIndex.from_breaks(idx) - expected = IntervalIndex.from_breaks(idx.values) + result = IntervalIndex.from_breaks(idx, closed=closed) + expected = IntervalIndex.from_breaks(idx.values, closed=closed) tm.assert_index_equal(result, expected) expected_scalar_type = type(idx[0]) @@ -117,8 +166,8 @@ def f(): IntervalIndex.from_intervals([0.997, 4.0]) pytest.raises(TypeError, f) - def test_properties(self): - index = self.index + def test_properties(self, closed): + index = self.create_index(closed=closed) assert len(index) == 2 assert index.size == 2 assert index.shape == (2, ) @@ -127,14 +176,15 @@ def test_properties(self): tm.assert_index_equal(index.right, Index([1, 2])) tm.assert_index_equal(index.mid, Index([0.5, 1.5])) - assert index.closed == 'right' + assert index.closed == closed - expected = np.array([Interval(0, 1), Interval(1, 2)], dtype=object) + expected = np.array([Interval(0, 1, closed=closed), + Interval(1, 2, closed=closed)], dtype=object) tm.assert_numpy_array_equal(np.asarray(index), expected) tm.assert_numpy_array_equal(index.values, expected) # with nans - index = self.index_with_nan + index = self.create_index_with_nan(closed=closed) assert len(index) == 3 assert index.size == 3 assert index.shape == (3, ) @@ -143,41 +193,43 @@ def test_properties(self): tm.assert_index_equal(index.right, Index([1, np.nan, 2])) tm.assert_index_equal(index.mid, Index([0.5, np.nan, 1.5])) - assert index.closed == 'right' + assert index.closed == closed - expected = np.array([Interval(0, 1), np.nan, - Interval(1, 2)], dtype=object) + 
expected = np.array([Interval(0, 1, closed=closed), np.nan, + Interval(1, 2, closed=closed)], dtype=object) tm.assert_numpy_array_equal(np.asarray(index), expected) tm.assert_numpy_array_equal(index.values, expected) - def test_with_nans(self): - index = self.index + def test_with_nans(self, closed): + index = self.create_index(closed=closed) assert not index.hasnans tm.assert_numpy_array_equal(index.isna(), np.array([False, False])) tm.assert_numpy_array_equal(index.notna(), np.array([True, True])) - index = self.index_with_nan + index = self.create_index_with_nan(closed=closed) assert index.hasnans tm.assert_numpy_array_equal(index.notna(), np.array([True, False, True])) tm.assert_numpy_array_equal(index.isna(), np.array([False, True, False])) - def test_copy(self): - actual = self.index.copy() - assert actual.equals(self.index) + def test_copy(self, closed): + expected = IntervalIndex.from_breaks(np.arange(5), closed=closed) + + result = expected.copy() + assert result.equals(expected) - actual = self.index.copy(deep=True) - assert actual.equals(self.index) - assert actual.left is not self.index.left + result = expected.copy(deep=True) + assert result.equals(expected) + assert result.left is not expected.left - def test_ensure_copied_data(self): + def test_ensure_copied_data(self, closed): # exercise the copy flag in the constructor # not copying - index = self.index + index = self.create_index(closed=closed) result = IntervalIndex(index, copy=False) tm.assert_numpy_array_equal(index.left.values, result.left.values, check_same='same') @@ -191,23 +243,34 @@ def test_ensure_copied_data(self): tm.assert_numpy_array_equal(index.right.values, result.right.values, check_same='copy') - def test_equals(self): + def test_equals(self, closed): + expected = IntervalIndex.from_breaks(np.arange(5), closed=closed) + assert expected.equals(expected) + assert expected.equals(expected.copy()) - idx = self.index - assert idx.equals(idx) - assert idx.equals(idx.copy()) + assert not expected.equals(expected.astype(object)) + assert not expected.equals(np.array(expected)) + assert not expected.equals(list(expected)) - assert not idx.equals(idx.astype(object)) - assert not idx.equals(np.array(idx)) - assert not idx.equals(list(idx)) + assert not expected.equals([1, 2]) + assert not expected.equals(np.array([1, 2])) + assert not expected.equals(pd.date_range('20130101', periods=2)) - assert not idx.equals([1, 2]) - assert not idx.equals(np.array([1, 2])) - assert not idx.equals(pd.date_range('20130101', periods=2)) + expected_name1 = IntervalIndex.from_breaks( + np.arange(5), closed=closed, name='foo') + expected_name2 = IntervalIndex.from_breaks( + np.arange(5), closed=closed, name='bar') + assert expected.equals(expected_name1) + assert expected_name1.equals(expected_name2) - def test_astype(self): + for other_closed in {'left', 'right', 'both', 'neither'} - {closed}: + expected_other_closed = IntervalIndex.from_breaks( + np.arange(5), closed=other_closed) + assert not expected.equals(expected_other_closed) - idx = self.index + def test_astype(self, closed): + + idx = self.create_index(closed=closed) for dtype in [np.int64, np.float64, 'datetime64[ns]', 'datetime64[ns, US/Eastern]', 'timedelta64', @@ -227,24 +290,24 @@ def test_astype(self): expected = pd.Categorical(idx, ordered=True) tm.assert_categorical_equal(result, expected) - def test_where(self): - expected = self.index - result = self.index.where(self.index.notna()) + def test_where(self, closed): + expected = self.create_index(closed=closed) + 
result = expected.where(expected.notna()) tm.assert_index_equal(result, expected) - idx = IntervalIndex.from_breaks([1, 2]) + idx = IntervalIndex.from_breaks([1, 2], closed=closed) result = idx.where([True, False]) expected = IntervalIndex.from_intervals( - [Interval(1.0, 2.0, closed='right'), np.nan]) + [Interval(1.0, 2.0, closed=closed), np.nan]) tm.assert_index_equal(result, expected) def test_where_array_like(self): pass - def test_delete(self): - expected = IntervalIndex.from_breaks([1, 2]) - actual = self.index.delete(0) - assert expected.equals(actual) + def test_delete(self, closed): + expected = IntervalIndex.from_breaks([1, 2], closed=closed) + result = self.create_index(closed=closed).delete(0) + tm.assert_index_equal(result, expected) def test_insert(self): expected = IntervalIndex.from_breaks(range(4)) @@ -255,113 +318,128 @@ def test_insert(self): pytest.raises(ValueError, self.index.insert, 0, Interval(2, 3, closed='left')) - def test_take(self): - actual = self.index.take([0, 1]) - assert self.index.equals(actual) + def test_take(self, closed): + index = self.create_index(closed=closed) - expected = IntervalIndex.from_arrays([0, 0, 1], [1, 1, 2]) - actual = self.index.take([0, 0, 1]) - assert expected.equals(actual) + actual = index.take([0, 1]) + tm.assert_index_equal(actual, index) + + expected = IntervalIndex.from_arrays( + [0, 0, 1], [1, 1, 2], closed=closed) + actual = index.take([0, 0, 1]) + tm.assert_index_equal(actual, expected) - def test_unique(self): + def test_unique(self, closed): # unique non-overlapping - idx = IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)]) + idx = IntervalIndex.from_tuples( + [(0, 1), (2, 3), (4, 5)], closed=closed) assert idx.is_unique # unique overlapping - distinct endpoints - idx = IntervalIndex.from_tuples([(0, 1), (0.5, 1.5)]) + idx = IntervalIndex.from_tuples([(0, 1), (0.5, 1.5)], closed=closed) assert idx.is_unique # unique overlapping - shared endpoints - idx = pd.IntervalIndex.from_tuples([(1, 2), (1, 3), (2, 3)]) + idx = pd.IntervalIndex.from_tuples( + [(1, 2), (1, 3), (2, 3)], closed=closed) assert idx.is_unique # unique nested - idx = IntervalIndex.from_tuples([(-1, 1), (-2, 2)]) + idx = IntervalIndex.from_tuples([(-1, 1), (-2, 2)], closed=closed) assert idx.is_unique # duplicate - idx = IntervalIndex.from_tuples([(0, 1), (0, 1), (2, 3)]) + idx = IntervalIndex.from_tuples( + [(0, 1), (0, 1), (2, 3)], closed=closed) assert not idx.is_unique # unique mixed - idx = IntervalIndex.from_tuples([(0, 1), ('a', 'b')]) + idx = IntervalIndex.from_tuples([(0, 1), ('a', 'b')], closed=closed) assert idx.is_unique # duplicate mixed - idx = IntervalIndex.from_tuples([(0, 1), ('a', 'b'), (0, 1)]) + idx = IntervalIndex.from_tuples( + [(0, 1), ('a', 'b'), (0, 1)], closed=closed) assert not idx.is_unique # empty - idx = IntervalIndex([]) + idx = IntervalIndex([], closed=closed) assert idx.is_unique - def test_monotonic(self): + def test_monotonic(self, closed): # increasing non-overlapping - idx = IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)]) + idx = IntervalIndex.from_tuples( + [(0, 1), (2, 3), (4, 5)], closed=closed) assert idx.is_monotonic assert idx._is_strictly_monotonic_increasing assert not idx.is_monotonic_decreasing assert not idx._is_strictly_monotonic_decreasing # decreasing non-overlapping - idx = IntervalIndex.from_tuples([(4, 5), (2, 3), (1, 2)]) + idx = IntervalIndex.from_tuples( + [(4, 5), (2, 3), (1, 2)], closed=closed) assert not idx.is_monotonic assert not idx._is_strictly_monotonic_increasing assert 
idx.is_monotonic_decreasing assert idx._is_strictly_monotonic_decreasing # unordered non-overlapping - idx = IntervalIndex.from_tuples([(0, 1), (4, 5), (2, 3)]) + idx = IntervalIndex.from_tuples( + [(0, 1), (4, 5), (2, 3)], closed=closed) assert not idx.is_monotonic assert not idx._is_strictly_monotonic_increasing assert not idx.is_monotonic_decreasing assert not idx._is_strictly_monotonic_decreasing # increasing overlapping - idx = IntervalIndex.from_tuples([(0, 2), (0.5, 2.5), (1, 3)]) + idx = IntervalIndex.from_tuples( + [(0, 2), (0.5, 2.5), (1, 3)], closed=closed) assert idx.is_monotonic assert idx._is_strictly_monotonic_increasing assert not idx.is_monotonic_decreasing assert not idx._is_strictly_monotonic_decreasing # decreasing overlapping - idx = IntervalIndex.from_tuples([(1, 3), (0.5, 2.5), (0, 2)]) + idx = IntervalIndex.from_tuples( + [(1, 3), (0.5, 2.5), (0, 2)], closed=closed) assert not idx.is_monotonic assert not idx._is_strictly_monotonic_increasing assert idx.is_monotonic_decreasing assert idx._is_strictly_monotonic_decreasing # unordered overlapping - idx = IntervalIndex.from_tuples([(0.5, 2.5), (0, 2), (1, 3)]) + idx = IntervalIndex.from_tuples( + [(0.5, 2.5), (0, 2), (1, 3)], closed=closed) assert not idx.is_monotonic assert not idx._is_strictly_monotonic_increasing assert not idx.is_monotonic_decreasing assert not idx._is_strictly_monotonic_decreasing # increasing overlapping shared endpoints - idx = pd.IntervalIndex.from_tuples([(1, 2), (1, 3), (2, 3)]) + idx = pd.IntervalIndex.from_tuples( + [(1, 2), (1, 3), (2, 3)], closed=closed) assert idx.is_monotonic assert idx._is_strictly_monotonic_increasing assert not idx.is_monotonic_decreasing assert not idx._is_strictly_monotonic_decreasing # decreasing overlapping shared endpoints - idx = pd.IntervalIndex.from_tuples([(2, 3), (1, 3), (1, 2)]) + idx = pd.IntervalIndex.from_tuples( + [(2, 3), (1, 3), (1, 2)], closed=closed) assert not idx.is_monotonic assert not idx._is_strictly_monotonic_increasing assert idx.is_monotonic_decreasing assert idx._is_strictly_monotonic_decreasing # stationary - idx = IntervalIndex.from_tuples([(0, 1), (0, 1)]) + idx = IntervalIndex.from_tuples([(0, 1), (0, 1)], closed=closed) assert idx.is_monotonic assert not idx._is_strictly_monotonic_increasing assert idx.is_monotonic_decreasing assert not idx._is_strictly_monotonic_decreasing # empty - idx = IntervalIndex([]) + idx = IntervalIndex([], closed=closed) assert idx.is_monotonic assert idx._is_strictly_monotonic_increasing assert idx.is_monotonic_decreasing @@ -395,24 +473,24 @@ def test_repr_max_seq_item_setting(self): def test_repr_roundtrip(self): super(TestIntervalIndex, self).test_repr_roundtrip() - def test_get_item(self): + def test_get_item(self, closed): i = IntervalIndex.from_arrays((0, 1, np.nan), (1, 2, np.nan), - closed='right') - assert i[0] == Interval(0.0, 1.0) - assert i[1] == Interval(1.0, 2.0) + closed=closed) + assert i[0] == Interval(0.0, 1.0, closed=closed) + assert i[1] == Interval(1.0, 2.0, closed=closed) assert isna(i[2]) result = i[0:1] - expected = IntervalIndex.from_arrays((0.,), (1.,), closed='right') + expected = IntervalIndex.from_arrays((0.,), (1.,), closed=closed) tm.assert_index_equal(result, expected) result = i[0:2] - expected = IntervalIndex.from_arrays((0., 1), (1., 2.), closed='right') + expected = IntervalIndex.from_arrays((0., 1), (1., 2.), closed=closed) tm.assert_index_equal(result, expected) result = i[1:3] expected = IntervalIndex.from_arrays((1., np.nan), (2., np.nan), - closed='right') + 
closed=closed) tm.assert_index_equal(result, expected) def test_get_loc_value(self): @@ -581,20 +659,22 @@ def testcontains(self): assert not i.contains(20) assert not i.contains(-20) - def test_dropna(self): + def test_dropna(self, closed): - expected = IntervalIndex.from_tuples([(0.0, 1.0), (1.0, 2.0)]) + expected = IntervalIndex.from_tuples( + [(0.0, 1.0), (1.0, 2.0)], closed=closed) - ii = IntervalIndex.from_tuples([(0, 1), (1, 2), np.nan]) + ii = IntervalIndex.from_tuples([(0, 1), (1, 2), np.nan], closed=closed) result = ii.dropna() tm.assert_index_equal(result, expected) - ii = IntervalIndex.from_arrays([0, 1, np.nan], [1, 2, np.nan]) + ii = IntervalIndex.from_arrays( + [0, 1, np.nan], [1, 2, np.nan], closed=closed) result = ii.dropna() tm.assert_index_equal(result, expected) - def test_non_contiguous(self): - index = IntervalIndex.from_tuples([(0, 1), (2, 3)]) + def test_non_contiguous(self, closed): + index = IntervalIndex.from_tuples([(0, 1), (2, 3)], closed=closed) target = [0.5, 1.5, 2.5] actual = index.get_indexer(target) expected = np.array([0, -1, 1], dtype='intp') @@ -602,31 +682,32 @@ def test_non_contiguous(self): assert 1.5 not in index - def test_union(self): - other = IntervalIndex.from_arrays([2], [3]) - expected = IntervalIndex.from_arrays(range(3), range(1, 4)) - actual = self.index.union(other) + def test_union(self, closed): + idx = self.create_index(closed=closed) + other = IntervalIndex.from_arrays([2], [3], closed=closed) + expected = IntervalIndex.from_arrays( + range(3), range(1, 4), closed=closed) + actual = idx.union(other) assert expected.equals(actual) - actual = other.union(self.index) + actual = other.union(idx) assert expected.equals(actual) - tm.assert_index_equal(self.index.union(self.index), self.index) - tm.assert_index_equal(self.index.union(self.index[:1]), - self.index) + tm.assert_index_equal(idx.union(idx), idx) + tm.assert_index_equal(idx.union(idx[:1]), idx) - def test_intersection(self): - other = IntervalIndex.from_breaks([1, 2, 3]) - expected = IntervalIndex.from_breaks([1, 2]) - actual = self.index.intersection(other) + def test_intersection(self, closed): + idx = self.create_index(closed=closed) + other = IntervalIndex.from_breaks([1, 2, 3], closed=closed) + expected = IntervalIndex.from_breaks([1, 2], closed=closed) + actual = idx.intersection(other) assert expected.equals(actual) - tm.assert_index_equal(self.index.intersection(self.index), - self.index) + tm.assert_index_equal(idx.intersection(idx), idx) - def test_difference(self): - tm.assert_index_equal(self.index.difference(self.index[:1]), - self.index[1:]) + def test_difference(self, closed): + idx = self.create_index(closed=closed) + tm.assert_index_equal(idx.difference(idx[:1]), idx[1:]) def test_symmetric_difference(self): result = self.index[:1].symmetric_difference(self.index[1:]) @@ -639,11 +720,12 @@ def test_set_operation_errors(self): other = IntervalIndex.from_breaks([0, 1, 2], closed='neither') pytest.raises(ValueError, self.index.union, other) - def test_isin(self): - actual = self.index.isin(self.index) + def test_isin(self, closed): + idx = self.create_index(closed=closed) + actual = idx.isin(idx) tm.assert_numpy_array_equal(np.array([True, True]), actual) - actual = self.index.isin(self.index[:1]) + actual = idx.isin(idx[:1]) tm.assert_numpy_array_equal(np.array([True, False]), actual) def test_comparison(self): @@ -702,25 +784,28 @@ def test_comparison(self): with pytest.raises(ValueError): self.index > np.arange(3) - def test_missing_values(self): - idx = 
pd.Index([np.nan, pd.Interval(0, 1), pd.Interval(1, 2)]) - idx2 = pd.IntervalIndex.from_arrays([np.nan, 0, 1], [np.nan, 1, 2]) + def test_missing_values(self, closed): + idx = Index([np.nan, Interval(0, 1, closed=closed), + Interval(1, 2, closed=closed)]) + idx2 = IntervalIndex.from_arrays( + [np.nan, 0, 1], [np.nan, 1, 2], closed=closed) assert idx.equals(idx2) with pytest.raises(ValueError): - IntervalIndex.from_arrays([np.nan, 0, 1], np.array([0, 1, 2])) + IntervalIndex.from_arrays( + [np.nan, 0, 1], np.array([0, 1, 2]), closed=closed) tm.assert_numpy_array_equal(isna(idx), np.array([True, False, False])) - def test_sort_values(self): - expected = IntervalIndex.from_breaks([1, 2, 3, 4]) - actual = IntervalIndex.from_tuples([(3, 4), (1, 2), - (2, 3)]).sort_values() + def test_sort_values(self, closed): + expected = IntervalIndex.from_breaks([1, 2, 3, 4], closed=closed) + actual = IntervalIndex.from_tuples( + [(3, 4), (1, 2), (2, 3)], closed=closed).sort_values() tm.assert_index_equal(expected, actual) # nan - idx = self.index_with_nan + idx = self.create_index_with_nan(closed=closed) mask = idx.isna() tm.assert_numpy_array_equal(mask, np.array([False, True, False])) @@ -733,84 +818,83 @@ def test_sort_values(self): tm.assert_numpy_array_equal(mask, np.array([True, False, False])) def test_datetime(self): - dates = pd.date_range('2000', periods=3) + dates = date_range('2000', periods=3) idx = IntervalIndex.from_breaks(dates) tm.assert_index_equal(idx.left, dates[:2]) tm.assert_index_equal(idx.right, dates[-2:]) - expected = pd.date_range('2000-01-01T12:00', periods=2) + expected = date_range('2000-01-01T12:00', periods=2) tm.assert_index_equal(idx.mid, expected) - assert pd.Timestamp('2000-01-01T12') not in idx - assert pd.Timestamp('2000-01-01T12') not in idx + assert Timestamp('2000-01-01T12') not in idx + assert Timestamp('2000-01-01T12') not in idx - target = pd.date_range('1999-12-31T12:00', periods=7, freq='12H') + target = date_range('1999-12-31T12:00', periods=7, freq='12H') actual = idx.get_indexer(target) expected = np.array([-1, -1, 0, 0, 1, 1, -1], dtype='intp') tm.assert_numpy_array_equal(actual, expected) - def test_append(self): + def test_append(self, closed): - index1 = IntervalIndex.from_arrays([0, 1], [1, 2]) - index2 = IntervalIndex.from_arrays([1, 2], [2, 3]) + index1 = IntervalIndex.from_arrays([0, 1], [1, 2], closed=closed) + index2 = IntervalIndex.from_arrays([1, 2], [2, 3], closed=closed) result = index1.append(index2) - expected = IntervalIndex.from_arrays([0, 1, 1, 2], [1, 2, 2, 3]) + expected = IntervalIndex.from_arrays( + [0, 1, 1, 2], [1, 2, 2, 3], closed=closed) tm.assert_index_equal(result, expected) result = index1.append([index1, index2]) - expected = IntervalIndex.from_arrays([0, 1, 0, 1, 1, 2], - [1, 2, 1, 2, 2, 3]) + expected = IntervalIndex.from_arrays( + [0, 1, 0, 1, 1, 2], [1, 2, 1, 2, 2, 3], closed=closed) tm.assert_index_equal(result, expected) - def f(): - index1.append(IntervalIndex.from_arrays([0, 1], [1, 2], - closed='both')) - - pytest.raises(ValueError, f) + msg = ('can only append two IntervalIndex objects that are closed ' + 'on the same side') + for other_closed in {'left', 'right', 'both', 'neither'} - {closed}: + index_other_closed = IntervalIndex.from_arrays( + [0, 1], [1, 2], closed=other_closed) + with tm.assert_raises_regex(ValueError, msg): + index1.append(index_other_closed) - def test_is_non_overlapping_monotonic(self): + def test_is_non_overlapping_monotonic(self, closed): # Should be True in all cases tpls = [(0, 1), (2, 3), 
(4, 5), (6, 7)] - for closed in ('left', 'right', 'neither', 'both'): - idx = IntervalIndex.from_tuples(tpls, closed=closed) - assert idx.is_non_overlapping_monotonic is True + idx = IntervalIndex.from_tuples(tpls, closed=closed) + assert idx.is_non_overlapping_monotonic is True - idx = IntervalIndex.from_tuples(reversed(tpls), closed=closed) - assert idx.is_non_overlapping_monotonic is True + idx = IntervalIndex.from_tuples(reversed(tpls), closed=closed) + assert idx.is_non_overlapping_monotonic is True # Should be False in all cases (overlapping) tpls = [(0, 2), (1, 3), (4, 5), (6, 7)] - for closed in ('left', 'right', 'neither', 'both'): - idx = IntervalIndex.from_tuples(tpls, closed=closed) - assert idx.is_non_overlapping_monotonic is False + idx = IntervalIndex.from_tuples(tpls, closed=closed) + assert idx.is_non_overlapping_monotonic is False - idx = IntervalIndex.from_tuples(reversed(tpls), closed=closed) - assert idx.is_non_overlapping_monotonic is False + idx = IntervalIndex.from_tuples(reversed(tpls), closed=closed) + assert idx.is_non_overlapping_monotonic is False # Should be False in all cases (non-monotonic) tpls = [(0, 1), (2, 3), (6, 7), (4, 5)] - for closed in ('left', 'right', 'neither', 'both'): - idx = IntervalIndex.from_tuples(tpls, closed=closed) - assert idx.is_non_overlapping_monotonic is False - - idx = IntervalIndex.from_tuples(reversed(tpls), closed=closed) - assert idx.is_non_overlapping_monotonic is False + idx = IntervalIndex.from_tuples(tpls, closed=closed) + assert idx.is_non_overlapping_monotonic is False - # Should be False for closed='both', overwise True (GH16560) - idx = IntervalIndex.from_breaks(range(4), closed='both') + idx = IntervalIndex.from_tuples(reversed(tpls), closed=closed) assert idx.is_non_overlapping_monotonic is False - for closed in ('left', 'right', 'neither'): + # Should be False for closed='both', overwise True (GH16560) + if closed == 'both': + idx = IntervalIndex.from_breaks(range(4), closed=closed) + assert idx.is_non_overlapping_monotonic is False + else: idx = IntervalIndex.from_breaks(range(4), closed=closed) assert idx.is_non_overlapping_monotonic is True class TestIntervalRange(object): - @pytest.mark.parametrize('closed', ['left', 'right', 'neither', 'both']) def test_construction_from_numeric(self, closed): # combinations of start/end/periods without freq expected = IntervalIndex.from_breaks( @@ -848,7 +932,6 @@ def test_construction_from_numeric(self, closed): closed=closed) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('closed', ['left', 'right', 'neither', 'both']) def test_construction_from_timestamp(self, closed): # combinations of start/end/periods without freq start, end = Timestamp('2017-01-01'), Timestamp('2017-01-06') @@ -915,7 +998,6 @@ def test_construction_from_timestamp(self, closed): closed=closed) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize('closed', ['left', 'right', 'neither', 'both']) def test_construction_from_timedelta(self, closed): # combinations of start/end/periods without freq start, end = Timedelta('1 day'), Timedelta('6 days') diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 18bfc3d0efbee..c9c4029786c64 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -2980,3 +2980,13 @@ def test_nan_stays_float(self): assert pd.isna(df0.index.get_level_values(1)).all() # the following failed in 0.14.1 assert pd.isna(dfm.index.get_level_values(1)[:-1]).all() + + def 
test_million_record_attribute_error(self): + # GH 18165 + r = list(range(1000000)) + df = pd.DataFrame({'a': r, 'b': r}, + index=pd.MultiIndex.from_tuples([(x, x) for x in r])) + + with tm.assert_raises_regex(AttributeError, + "'Series' object has no attribute 'foo'"): + df['a'].foo() diff --git a/pandas/tests/indexing/test_timedelta.py b/pandas/tests/indexing/test_timedelta.py index 32609362e49af..3ad3b771b2ab2 100644 --- a/pandas/tests/indexing/test_timedelta.py +++ b/pandas/tests/indexing/test_timedelta.py @@ -2,6 +2,7 @@ import pandas as pd from pandas.util import testing as tm +import numpy as np class TestTimedeltaIndexing(object): @@ -47,3 +48,23 @@ def test_string_indexing(self): expected = df.iloc[0] sliced = df.loc['0 days'] tm.assert_series_equal(sliced, expected) + + @pytest.mark.parametrize( + "value", + [None, pd.NaT, np.nan]) + def test_masked_setitem(self, value): + # issue (#18586) + series = pd.Series([0, 1, 2], dtype='timedelta64[ns]') + series[series == series[0]] = value + expected = pd.Series([pd.NaT, 1, 2], dtype='timedelta64[ns]') + tm.assert_series_equal(series, expected) + + @pytest.mark.parametrize( + "value", + [None, pd.NaT, np.nan]) + def test_listlike_setitem(self, value): + # issue (#18586) + series = pd.Series([0, 1, 2], dtype='timedelta64[ns]') + series.iloc[0] = value + expected = pd.Series([pd.NaT, 1, 2], dtype='timedelta64[ns]') + tm.assert_series_equal(series, expected) diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py new file mode 100644 index 0000000000000..828d5d0ccd3c6 --- /dev/null +++ b/pandas/tests/io/conftest.py @@ -0,0 +1,74 @@ +import os + +import moto +import pytest +from pandas.io.parsers import read_table + +HERE = os.path.dirname(__file__) + + +@pytest.fixture(scope='module') +def tips_file(): + """Path to the tips dataset""" + return os.path.join(HERE, 'parser', 'data', 'tips.csv') + + +@pytest.fixture(scope='module') +def jsonl_file(): + """Path a JSONL dataset""" + return os.path.join(HERE, 'parser', 'data', 'items.jsonl') + + +@pytest.fixture(scope='module') +def salaries_table(): + """DataFrame with the salaries dataset""" + path = os.path.join(HERE, 'parser', 'data', 'salaries.csv') + return read_table(path) + + +@pytest.fixture(scope='module') +def s3_resource(tips_file, jsonl_file): + """Fixture for mocking S3 interaction. + + The primary bucket name is "pandas-test". The following datasets + are loaded. + + - tips.csv + - tips.csv.gz + - tips.csv.bz2 + - items.jsonl + + A private bucket "cant_get_it" is also created. The boto3 s3 resource + is yielded by the fixture. 
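The new `s3_resource` fixture above uses moto to stand up a mocked S3 bucket ("pandas-test") pre-loaded with the tips and JSONL files, so S3-reading tests run without touching the network. A minimal sketch of how a test might consume it (the test name is illustrative, and `s3fs` is assumed to be installed, as the fixture itself requires):

```python
import pandas as pd

def test_read_csv_from_mocked_s3(s3_resource):
    # moto intercepts the S3 call, so no real AWS credentials are needed;
    # "pandas-test" is the public bucket the fixture seeds with tips.csv.
    df = pd.read_csv("s3://pandas-test/tips.csv")
    assert not df.empty
```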
+ """ + pytest.importorskip('s3fs') + moto.mock_s3().start() + + test_s3_files = [ + ('tips.csv', tips_file), + ('tips.csv.gz', tips_file + '.gz'), + ('tips.csv.bz2', tips_file + '.bz2'), + ('items.jsonl', jsonl_file), + ] + + def add_tips_files(bucket_name): + for s3_key, file_name in test_s3_files: + with open(file_name, 'rb') as f: + conn.Bucket(bucket_name).put_object( + Key=s3_key, + Body=f) + + boto3 = pytest.importorskip('boto3') + # see gh-16135 + bucket = 'pandas-test' + + conn = boto3.resource("s3", region_name="us-east-1") + conn.create_bucket(Bucket=bucket) + add_tips_files(bucket) + + conn.create_bucket(Bucket='cant_get_it', ACL='private') + add_tips_files('cant_get_it') + + yield conn + + moto.mock_s3().stop() diff --git a/pandas/tests/io/data/stata13_dates.dta b/pandas/tests/io/data/stata13_dates.dta new file mode 100644 index 0000000000000..87b857559e501 Binary files /dev/null and b/pandas/tests/io/data/stata13_dates.dta differ diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py index aa86d1d9231fb..c0b7d4cee384a 100644 --- a/pandas/tests/io/formats/test_to_latex.py +++ b/pandas/tests/io/formats/test_to_latex.py @@ -221,6 +221,28 @@ def test_to_latex_multiindex(self): assert result == expected + def test_to_latex_multiindex_dupe_level(self): + # see gh-14484 + # + # If an index is repeated in subsequent rows, it should be + # replaced with a blank in the created table. This should + # ONLY happen if all higher order indices (to the left) are + # equal too. In this test, 'c' has to be printed both times + # because the higher order index 'A' != 'B'. + df = pd.DataFrame(index=pd.MultiIndex.from_tuples( + [('A', 'c'), ('B', 'c')]), columns=['col']) + result = df.to_latex() + expected = r"""\begin{tabular}{lll} +\toprule + & & col \\ +\midrule +A & c & NaN \\ +B & c & NaN \\ +\bottomrule +\end{tabular} +""" + assert result == expected + def test_to_latex_multicolumnrow(self): df = pd.DataFrame({ ('c1', 0): dict((x, x) for x in range(5)), diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index 49b765b18d623..1cceae32cd748 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -173,6 +173,21 @@ def test_meta_name_conflict(self): for val in ['metafoo', 'metabar', 'foo', 'bar']: assert val in result + def test_meta_parameter_not_modified(self): + # GH 18610 + data = [{'foo': 'hello', + 'bar': 'there', + 'data': [{'foo': 'something', 'bar': 'else'}, + {'foo': 'something2', 'bar': 'else2'}]}] + + COLUMNS = ['foo', 'bar'] + result = json_normalize(data, 'data', meta=COLUMNS, + meta_prefix='meta') + + assert COLUMNS == ['foo', 'bar'] + for val in ['metafoo', 'metabar', 'foo', 'bar']: + assert val in result + def test_record_prefix(self, state_data): result = json_normalize(state_data[0], 'counties') expected = DataFrame(state_data[0]['counties']) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 6625446bea469..78e33f8966d1f 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -4,7 +4,6 @@ from pandas.compat import (range, lrange, StringIO, OrderedDict, is_platform_32bit) import os - import numpy as np from pandas import (Series, DataFrame, DatetimeIndex, Timestamp, read_json, compat) @@ -1030,6 +1029,70 @@ def test_tz_range_is_utc(self): df = DataFrame({'DT': dti}) assert dumps(df, iso_dates=True) == dfexp + def test_read_inline_jsonl(self): + # GH9180 + result = 
read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True) + expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) + assert_frame_equal(result, expected) + + def test_read_s3_jsonl(self, s3_resource): + pytest.importorskip('s3fs') + # GH17200 + + result = read_json('s3n://pandas-test/items.jsonl', lines=True) + expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) + assert_frame_equal(result, expected) + + def test_read_local_jsonl(self): + # GH17200 + with ensure_clean('tmp_items.json') as path: + with open(path, 'w') as infile: + infile.write('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n') + result = read_json(path, lines=True) + expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) + assert_frame_equal(result, expected) + + def test_read_jsonl_unicode_chars(self): + # GH15132: non-ascii unicode characters + # \u201d == RIGHT DOUBLE QUOTATION MARK + + # simulate file handle + json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n' + json = StringIO(json) + result = read_json(json, lines=True) + expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]], + columns=['a', 'b']) + assert_frame_equal(result, expected) + + # simulate string + json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n' + result = read_json(json, lines=True) + expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]], + columns=['a', 'b']) + assert_frame_equal(result, expected) + + def test_to_jsonl(self): + # GH9180 + df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) + result = df.to_json(orient="records", lines=True) + expected = '{"a":1,"b":2}\n{"a":1,"b":2}' + assert result == expected + + df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=['a', 'b']) + result = df.to_json(orient="records", lines=True) + expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}' + assert result == expected + assert_frame_equal(pd.read_json(result, lines=True), df) + + # GH15096: escaped characters in columns and data + df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]], + columns=["a\\", 'b']) + result = df.to_json(orient="records", lines=True) + expected = ('{"a\\\\":"foo\\\\","b":"bar"}\n' + '{"a\\\\":"foo\\"","b":"bar"}') + assert result == expected + assert_frame_equal(pd.read_json(result, lines=True), df) + def test_latin_encoding(self): if compat.PY2: tm.assert_raises_regex( diff --git a/pandas/tests/io/parser/c_parser_only.py b/pandas/tests/io/parser/c_parser_only.py index c68b2bf064d97..6d476e326213e 100644 --- a/pandas/tests/io/parser/c_parser_only.py +++ b/pandas/tests/io/parser/c_parser_only.py @@ -290,11 +290,11 @@ def test_empty_header_read(count): test_empty_header_read(count) def test_parse_trim_buffers(self): - # This test is part of a bugfix for issue #13703. It attmepts to + # This test is part of a bugfix for issue #13703. It attempts to # to stress the system memory allocator, to cause it to move the # stream buffer and either let the OS reclaim the region, or let # other memory requests of parser otherwise modify the contents - # of memory space, where it was formely located. + # of memory space, where it was formally located. # This test is designed to cause a `segfault` with unpatched # `tokenizer.c`. Sometimes the test fails on `segfault`, other # times it fails due to memory corruption, which causes the @@ -346,7 +346,7 @@ def test_parse_trim_buffers(self): # Generate the expected output: manually create the dataframe # by splitting by comma and repeating the `n_lines` times. 
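`test_parse_trim_buffers` above stresses the C parser by re-reading the same generated CSV in small `chunksize` pieces and concatenating them back into one frame. A minimal standalone sketch of that chunked-read pattern (the data and chunk size here are made up for illustration):

```python
from io import StringIO
import pandas as pd

csv_data = "\n".join(["a,b,c"] + ["1,2,3"] * 10)

# Read the buffer in small chunks, then reassemble the pieces,
# mirroring the chunksize/concat pattern used in the test above.
chunks = pd.read_csv(StringIO(csv_data), chunksize=4)
result = pd.concat(chunks, ignore_index=True)
assert len(result) == 10
```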
- row = tuple(val_ if val_ else float("nan") + row = tuple(val_ if val_ else np.nan for val_ in record_.split(",")) expected = pd.DataFrame([row for _ in range(n_lines)], dtype=object, columns=None, index=None) @@ -359,6 +359,15 @@ def test_parse_trim_buffers(self): # Check for data corruption if there was no segfault tm.assert_frame_equal(result, expected) + # This extra test was added to replicate the fault in gh-5291. + # Force 'utf-8' encoding, so that `_string_convert` would take + # a different execution branch. + chunks_ = self.read_csv(StringIO(csv_data), header=None, + dtype=object, chunksize=chunksize, + encoding='utf_8') + result = pd.concat(chunks_, axis=0, ignore_index=True) + tm.assert_frame_equal(result, expected) + def test_internal_null_byte(self): # see gh-14012 # diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py index e85d3ad294655..6a996213b28bb 100644 --- a/pandas/tests/io/parser/common.py +++ b/pandas/tests/io/parser/common.py @@ -823,7 +823,7 @@ def test_parse_integers_above_fp_precision(self): 17007000002000192, 17007000002000194]}) - assert np.array_equal(result['Numbers'], expected['Numbers']) + tm.assert_series_equal(result['Numbers'], expected['Numbers']) def test_chunks_have_consistent_numerical_type(self): integers = [str(i) for i in range(499999)] diff --git a/pandas/tests/io/parser/data/items.jsonl b/pandas/tests/io/parser/data/items.jsonl new file mode 100644 index 0000000000000..f784d37befa82 --- /dev/null +++ b/pandas/tests/io/parser/data/items.jsonl @@ -0,0 +1,2 @@ +{"a": 1, "b": 2} +{"b":2, "a" :1} diff --git a/pandas/tests/io/parser/dtypes.py b/pandas/tests/io/parser/dtypes.py index 7d3df6201a390..b91ce04673e29 100644 --- a/pandas/tests/io/parser/dtypes.py +++ b/pandas/tests/io/parser/dtypes.py @@ -114,6 +114,17 @@ def test_categorical_dtype(self): actual = self.read_csv(StringIO(data), dtype='category') tm.assert_frame_equal(actual, expected) + @pytest.mark.slow + def test_categorical_dtype_high_cardinality_numeric(self): + # GH 18186 + data = np.sort([str(i) for i in range(524289)]) + expected = DataFrame({'a': Categorical(data, ordered=True)}) + actual = self.read_csv(StringIO('a\n' + '\n'.join(data)), + dtype='category') + actual["a"] = actual["a"].cat.reorder_categories( + np.sort(actual.a.cat.categories), ordered=True) + tm.assert_frame_equal(actual, expected) + def test_categorical_dtype_encoding(self): # GH 10153 pth = tm.get_data_path('unicode_series.csv') diff --git a/pandas/tests/io/parser/na_values.py b/pandas/tests/io/parser/na_values.py index 7fbf174e19eee..8dc599b42ddc7 100644 --- a/pandas/tests/io/parser/na_values.py +++ b/pandas/tests/io/parser/na_values.py @@ -312,3 +312,21 @@ def test_empty_na_values_no_default_with_index(self): out = self.read_csv(StringIO(data), keep_default_na=False, index_col=0) tm.assert_frame_equal(out, expected) + + def test_no_na_filter_on_index(self): + # see gh-5239 + data = "a,b,c\n1,,3\n4,5,6" + + # Don't parse NA-values in index when na_filter=False. + out = self.read_csv(StringIO(data), index_col=[1], na_filter=False) + + expected = DataFrame({"a": [1, 4], "c": [3, 6]}, + index=Index(["", "5"], name="b")) + tm.assert_frame_equal(out, expected) + + # Parse NA-values in index when na_filter=True. 
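`test_no_na_filter_on_index` above pins down gh-5239: with `na_filter=False` the index column keeps empty strings verbatim, while the default NA filtering converts them to NaN. A small sketch of the two behaviours outside the test class, matching the expected frames shown above:

```python
from io import StringIO
import pandas as pd

data = "a,b,c\n1,,3\n4,5,6"

raw = pd.read_csv(StringIO(data), index_col=[1], na_filter=False)
print(raw.index.tolist())     # ['', '5'] -- empty field kept as a string

parsed = pd.read_csv(StringIO(data), index_col=[1], na_filter=True)
print(parsed.index.tolist())  # [nan, 5.0] -- empty field parsed as NaN
```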
+ out = self.read_csv(StringIO(data), index_col=[1], na_filter=True) + + expected = DataFrame({"a": [1, 4], "c": [3, 6]}, + index=Index([np.nan, 5.0], name="b")) + tm.assert_frame_equal(out, expected) diff --git a/pandas/tests/io/parser/parse_dates.py b/pandas/tests/io/parser/parse_dates.py index 90103e7bf26b0..4c0f67fa6876a 100644 --- a/pandas/tests/io/parser/parse_dates.py +++ b/pandas/tests/io/parser/parse_dates.py @@ -656,3 +656,21 @@ def test_parse_date_column_with_empty_string(self): [621, ' ']] expected = DataFrame(expected_data, columns=['case', 'opdate']) tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("data,expected", [ + ("a\n135217135789158401\n1352171357E+5", + DataFrame({"a": [135217135789158401, + 135217135700000]}, dtype="float64")), + ("a\n99999999999\n123456789012345\n1234E+0", + DataFrame({"a": [99999999999, + 123456789012345, + 1234]}, dtype="float64")) + ]) + @pytest.mark.parametrize("parse_dates", [True, False]) + def test_parse_date_float(self, data, expected, parse_dates): + # see gh-2697 + # + # Date parsing should fail, so we leave the data untouched + # (i.e. float precision should remain unchanged). + result = self.read_csv(StringIO(data), parse_dates=parse_dates) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index 27cc708889fa2..d00d3f31ce189 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -4,10 +4,7 @@ Tests parsers ability to read and parse non-local files and hence require a network connection to be read. """ -import os - import pytest -import moto import pandas.util.testing as tm from pandas import DataFrame @@ -15,51 +12,6 @@ from pandas.compat import BytesIO -@pytest.fixture(scope='module') -def tips_file(): - return os.path.join(tm.get_data_path(), 'tips.csv') - - -@pytest.fixture(scope='module') -def salaries_table(): - path = os.path.join(tm.get_data_path(), 'salaries.csv') - return read_table(path) - - -@pytest.fixture(scope='module') -def s3_resource(tips_file): - pytest.importorskip('s3fs') - moto.mock_s3().start() - - test_s3_files = [ - ('tips.csv', tips_file), - ('tips.csv.gz', tips_file + '.gz'), - ('tips.csv.bz2', tips_file + '.bz2'), - ] - - def add_tips_files(bucket_name): - for s3_key, file_name in test_s3_files: - with open(file_name, 'rb') as f: - conn.Bucket(bucket_name).put_object( - Key=s3_key, - Body=f) - - boto3 = pytest.importorskip('boto3') - # see gh-16135 - bucket = 'pandas-test' - - conn = boto3.resource("s3", region_name="us-east-1") - conn.create_bucket(Bucket=bucket) - add_tips_files(bucket) - - conn.create_bucket(Bucket='cant_get_it', ACL='private') - add_tips_files('cant_get_it') - - yield conn - - moto.mock_s3().stop() - - @pytest.mark.network @pytest.mark.parametrize( "compression,extension", diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index c9088d2ecc5e7..f66f9ccf065f7 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ -161,9 +161,9 @@ def test_skip_bad_lines(self): error_bad_lines=False, warn_bad_lines=False) result = reader.read() - expected = {0: ['a', 'd', 'g', 'l'], - 1: ['b', 'e', 'h', 'm'], - 2: ['c', 'f', 'i', 'n']} + expected = {0: np.array(['a', 'd', 'g', 'l'], dtype=object), + 1: np.array(['b', 'e', 'h', 'm'], dtype=object), + 2: np.array(['c', 'f', 'i', 'n'], dtype=object)} assert_array_dicts_equal(result, expected) reader = 
TextReader(StringIO(data), delimiter=':', @@ -189,8 +189,10 @@ def test_header_not_enough_lines(self): assert header == expected recs = reader.read() - expected = {0: [1, 4], 1: [2, 5], 2: [3, 6]} - assert_array_dicts_equal(expected, recs) + expected = {0: np.array([1, 4], dtype=np.int64), + 1: np.array([2, 5], dtype=np.int64), + 2: np.array([3, 6], dtype=np.int64)} + assert_array_dicts_equal(recs, expected) # not enough rows pytest.raises(parser.ParserError, TextReader, StringIO(data), @@ -203,14 +205,16 @@ def test_header_not_enough_lines_as_recarray(self): '1,2,3\n' '4,5,6') - reader = TextReader(StringIO(data), delimiter=',', header=2, - as_recarray=True) + reader = TextReader(StringIO(data), delimiter=',', + header=2, as_recarray=True) header = reader.header expected = [['a', 'b', 'c']] assert header == expected recs = reader.read() - expected = {'a': [1, 4], 'b': [2, 5], 'c': [3, 6]} + expected = {'a': np.array([1, 4], dtype=np.int64), + 'b': np.array([2, 5], dtype=np.int64), + 'c': np.array([3, 6], dtype=np.int64)} assert_array_dicts_equal(expected, recs) # not enough rows @@ -225,7 +229,7 @@ def test_escapechar(self): reader = TextReader(StringIO(data), delimiter=',', header=None, escapechar='\\') result = reader.read() - expected = {0: ['"hello world"'] * 3} + expected = {0: np.array(['"hello world"'] * 3, dtype=object)} assert_array_dicts_equal(result, expected) def test_eof_has_eol(self): @@ -360,7 +364,7 @@ def test_empty_field_eof(self): result = TextReader(StringIO(data), delimiter=',').read() - expected = {0: np.array([1, 4]), + expected = {0: np.array([1, 4], dtype=np.int64), 1: np.array(['2', ''], dtype=object), 2: np.array(['3', ''], dtype=object)} assert_array_dicts_equal(result, expected) @@ -397,4 +401,5 @@ def test_empty_csv_input(self): def assert_array_dicts_equal(left, right): for k, v in compat.iteritems(left): - assert(np.array_equal(v, right[k])) + assert tm.assert_numpy_array_equal(np.asarray(v), + np.asarray(right[k])) diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 940a331a9de84..b5d1435c29cb7 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -18,7 +18,7 @@ try: DataFrame({'A': [1, 2]}).to_clipboard() _DEPS_INSTALLED = 1 -except PyperclipException: +except (PyperclipException, RuntimeError): _DEPS_INSTALLED = 0 diff --git a/pandas/tests/io/test_packers.py b/pandas/tests/io/test_packers.py index a28adcf1ee771..bc58ea1c7c228 100644 --- a/pandas/tests/io/test_packers.py +++ b/pandas/tests/io/test_packers.py @@ -180,6 +180,15 @@ def test_scalar_float(self): x_rec = self.encode_decode(x) tm.assert_almost_equal(x, x_rec) + def test_scalar_bool(self): + x = np.bool_(1) + x_rec = self.encode_decode(x) + tm.assert_almost_equal(x, x_rec) + + x = np.bool_(0) + x_rec = self.encode_decode(x) + tm.assert_almost_equal(x, x_rec) + def test_scalar_complex(self): x = np.random.rand() + 1j * np.random.rand() x_rec = self.encode_decode(x) @@ -263,7 +272,7 @@ def test_numpy_array_complex(self): x.dtype == x_rec.dtype) def test_list_mixed(self): - x = [1.0, np.float32(3.5), np.complex128(4.25), u('foo')] + x = [1.0, np.float32(3.5), np.complex128(4.25), u('foo'), np.bool_(1)] x_rec = self.encode_decode(x) # current msgpack cannot distinguish list/tuple tm.assert_almost_equal(tuple(x), x_rec) @@ -401,6 +410,7 @@ def setup_method(self, method): 'G': [Timestamp('20130102', tz='US/Eastern')] * 5, 'H': Categorical([1, 2, 3, 4, 5]), 'I': Categorical([1, 2, 3, 4, 5], ordered=True), + 'J': (np.bool_(1), 2, 3, 
4, 5), } self.d['float'] = Series(data['A']) @@ -410,6 +420,7 @@ def setup_method(self, method): self.d['dt_tz'] = Series(data['G']) self.d['cat_ordered'] = Series(data['H']) self.d['cat_unordered'] = Series(data['I']) + self.d['numpy_bool_mixed'] = Series(data['J']) def test_basic(self): diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index ecd4e8f719014..e7bcff22371b7 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -105,7 +105,7 @@ def test_options_py(df_compat, pa): with pd.option_context('io.parquet.engine', 'pyarrow'): df.to_parquet(path) - result = read_parquet(path, compression=None) + result = read_parquet(path) tm.assert_frame_equal(result, df) @@ -118,7 +118,7 @@ def test_options_fp(df_compat, fp): with pd.option_context('io.parquet.engine', 'fastparquet'): df.to_parquet(path, compression=None) - result = read_parquet(path, compression=None) + result = read_parquet(path) tm.assert_frame_equal(result, df) @@ -130,7 +130,7 @@ def test_options_auto(df_compat, fp, pa): with pd.option_context('io.parquet.engine', 'auto'): df.to_parquet(path) - result = read_parquet(path, compression=None) + result = read_parquet(path) tm.assert_frame_equal(result, df) @@ -162,7 +162,7 @@ def test_cross_engine_pa_fp(df_cross_compat, pa, fp): with tm.ensure_clean() as path: df.to_parquet(path, engine=pa, compression=None) - result = read_parquet(path, engine=fp, compression=None) + result = read_parquet(path, engine=fp) tm.assert_frame_equal(result, df) @@ -174,7 +174,7 @@ def test_cross_engine_fp_pa(df_cross_compat, pa, fp): with tm.ensure_clean() as path: df.to_parquet(path, engine=fp, compression=None) - result = read_parquet(path, engine=pa, compression=None) + result = read_parquet(path, engine=pa) tm.assert_frame_equal(result, df) @@ -188,19 +188,23 @@ def check_error_on_write(self, df, engine, exc): with tm.ensure_clean() as path: to_parquet(df, path, engine, compression=None) - def check_round_trip(self, df, engine, expected=None, **kwargs): - + def check_round_trip(self, df, engine, expected=None, + write_kwargs=None, read_kwargs=None): + if write_kwargs is None: + write_kwargs = {} + if read_kwargs is None: + read_kwargs = {} with tm.ensure_clean() as path: - df.to_parquet(path, engine, **kwargs) - result = read_parquet(path, engine) + df.to_parquet(path, engine, **write_kwargs) + result = read_parquet(path, engine, **read_kwargs) if expected is None: expected = df tm.assert_frame_equal(result, expected) # repeat - to_parquet(df, path, engine, **kwargs) - result = pd.read_parquet(path, engine) + to_parquet(df, path, engine, **write_kwargs) + result = pd.read_parquet(path, engine, **read_kwargs) if expected is None: expected = df @@ -222,7 +226,7 @@ def test_columns_dtypes(self, engine): # unicode df.columns = [u'foo', u'bar'] - self.check_round_trip(df, engine, compression=None) + self.check_round_trip(df, engine, write_kwargs={'compression': None}) def test_columns_dtypes_invalid(self, engine): @@ -246,7 +250,7 @@ def test_columns_dtypes_invalid(self, engine): def test_write_with_index(self, engine): df = pd.DataFrame({'A': [1, 2, 3]}) - self.check_round_trip(df, engine, compression=None) + self.check_round_trip(df, engine, write_kwargs={'compression': None}) # non-default index for index in [[2, 3, 4], @@ -280,7 +284,18 @@ def test_compression(self, engine, compression): pytest.importorskip('brotli') df = pd.DataFrame({'A': [1, 2, 3]}) - self.check_round_trip(df, engine, compression=compression) + self.check_round_trip(df, 
engine, + write_kwargs={'compression': compression}) + + def test_read_columns(self, engine): + # GH18154 + df = pd.DataFrame({'string': list('abc'), + 'int': list(range(1, 4))}) + + expected = pd.DataFrame({'string': list('abc')}) + self.check_round_trip(df, engine, expected=expected, + write_kwargs={'compression': None}, + read_kwargs={'columns': ['string']}) class TestParquetPyArrow(Base): @@ -368,7 +383,7 @@ def test_basic(self, fp): 'timedelta': pd.timedelta_range('1 day', periods=3), }) - self.check_round_trip(df, fp, compression=None) + self.check_round_trip(df, fp, write_kwargs={'compression': None}) @pytest.mark.skip(reason="not supported") def test_duplicate_columns(self, fp): @@ -381,7 +396,8 @@ def test_duplicate_columns(self, fp): def test_bool_with_none(self, fp): df = pd.DataFrame({'a': [True, None, False]}) expected = pd.DataFrame({'a': [1.0, np.nan, 0.0]}, dtype='float16') - self.check_round_trip(df, fp, expected=expected, compression=None) + self.check_round_trip(df, fp, expected=expected, + write_kwargs={'compression': None}) def test_unsupported(self, fp): @@ -397,7 +413,7 @@ def test_categorical(self, fp): if LooseVersion(fastparquet.__version__) < LooseVersion("0.1.3"): pytest.skip("CategoricalDtype not supported for older fp") df = pd.DataFrame({'a': pd.Categorical(list('abc'))}) - self.check_round_trip(df, fp, compression=None) + self.check_round_trip(df, fp, write_kwargs={'compression': None}) def test_datetime_tz(self, fp): # doesn't preserve tz @@ -407,4 +423,13 @@ def test_datetime_tz(self, fp): # warns on the coercion with catch_warnings(record=True): self.check_round_trip(df, fp, df.astype('datetime64[ns]'), - compression=None) + write_kwargs={'compression': None}) + + def test_filter_row_groups(self, fp): + d = {'a': list(range(0, 3))} + df = pd.DataFrame(d) + with tm.ensure_clean() as path: + df.to_parquet(path, fp, compression=None, + row_group_offsets=1) + result = read_parquet(path, fp, filters=[('a', '==', 0)]) + assert len(result) == 1 diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 2df43158b5370..4528565eefa0c 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -88,6 +88,7 @@ "TextCol" TEXT, "DateCol" TEXT, "IntDateCol" INTEGER, + "IntDateOnlyCol" INTEGER, "FloatCol" REAL, "IntCol" INTEGER, "BoolCol" INTEGER, @@ -98,6 +99,7 @@ `TextCol` TEXT, `DateCol` DATETIME, `IntDateCol` INTEGER, + `IntDateOnlyCol` INTEGER, `FloatCol` DOUBLE, `IntCol` INTEGER, `BoolCol` BOOLEAN, @@ -109,6 +111,7 @@ "DateCol" TIMESTAMP, "DateColWithTz" TIMESTAMP WITH TIME ZONE, "IntDateCol" INTEGER, + "IntDateOnlyCol" INTEGER, "FloatCol" DOUBLE PRECISION, "IntCol" INTEGER, "BoolCol" BOOLEAN, @@ -120,31 +123,33 @@ 'sqlite': { 'query': """ INSERT INTO types_test_data - VALUES(?, ?, ?, ?, ?, ?, ?, ?) + VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?) 
""", 'fields': ( - 'TextCol', 'DateCol', 'IntDateCol', 'FloatCol', - 'IntCol', 'BoolCol', 'IntColWithNull', 'BoolColWithNull' + 'TextCol', 'DateCol', 'IntDateCol', 'IntDateOnlyCol', + 'FloatCol', 'IntCol', 'BoolCol', 'IntColWithNull', + 'BoolColWithNull' ) }, 'mysql': { 'query': """ INSERT INTO types_test_data - VALUES("%s", %s, %s, %s, %s, %s, %s, %s) + VALUES("%s", %s, %s, %s, %s, %s, %s, %s, %s) """, 'fields': ( - 'TextCol', 'DateCol', 'IntDateCol', 'FloatCol', - 'IntCol', 'BoolCol', 'IntColWithNull', 'BoolColWithNull' + 'TextCol', 'DateCol', 'IntDateCol', 'IntDateOnlyCol', + 'FloatCol', 'IntCol', 'BoolCol', 'IntColWithNull', + 'BoolColWithNull' ) }, 'postgresql': { 'query': """ INSERT INTO types_test_data - VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s) + VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s) """, 'fields': ( 'TextCol', 'DateCol', 'DateColWithTz', - 'IntDateCol', 'FloatCol', + 'IntDateCol', 'IntDateOnlyCol', 'FloatCol', 'IntCol', 'BoolCol', 'IntColWithNull', 'BoolColWithNull' ) }, @@ -313,13 +318,13 @@ def _load_raw_sql(self): self.drop_table('types_test_data') self._get_exec().execute(SQL_STRINGS['create_test_types'][self.flavor]) ins = SQL_STRINGS['insert_test_types'][self.flavor] - data = [ { 'TextCol': 'first', 'DateCol': '2000-01-03 00:00:00', 'DateColWithTz': '2000-01-01 00:00:00-08:00', 'IntDateCol': 535852800, + 'IntDateOnlyCol': 20101010, 'FloatCol': 10.10, 'IntCol': 1, 'BoolCol': False, @@ -331,6 +336,7 @@ def _load_raw_sql(self): 'DateCol': '2000-01-04 00:00:00', 'DateColWithTz': '2000-06-01 00:00:00-07:00', 'IntDateCol': 1356998400, + 'IntDateOnlyCol': 20101212, 'FloatCol': 10.10, 'IntCol': 1, 'BoolCol': False, @@ -610,20 +616,42 @@ def test_date_parsing(self): df = sql.read_sql_query("SELECT * FROM types_test_data", self.conn, parse_dates=['DateCol']) assert issubclass(df.DateCol.dtype.type, np.datetime64) + assert df.DateCol.tolist() == [ + pd.Timestamp(2000, 1, 3, 0, 0, 0), + pd.Timestamp(2000, 1, 4, 0, 0, 0) + ] df = sql.read_sql_query("SELECT * FROM types_test_data", self.conn, parse_dates={'DateCol': '%Y-%m-%d %H:%M:%S'}) assert issubclass(df.DateCol.dtype.type, np.datetime64) + assert df.DateCol.tolist() == [ + pd.Timestamp(2000, 1, 3, 0, 0, 0), + pd.Timestamp(2000, 1, 4, 0, 0, 0) + ] df = sql.read_sql_query("SELECT * FROM types_test_data", self.conn, parse_dates=['IntDateCol']) - assert issubclass(df.IntDateCol.dtype.type, np.datetime64) + assert df.IntDateCol.tolist() == [ + pd.Timestamp(1986, 12, 25, 0, 0, 0), + pd.Timestamp(2013, 1, 1, 0, 0, 0) + ] df = sql.read_sql_query("SELECT * FROM types_test_data", self.conn, parse_dates={'IntDateCol': 's'}) - assert issubclass(df.IntDateCol.dtype.type, np.datetime64) + assert df.IntDateCol.tolist() == [ + pd.Timestamp(1986, 12, 25, 0, 0, 0), + pd.Timestamp(2013, 1, 1, 0, 0, 0) + ] + + df = sql.read_sql_query("SELECT * FROM types_test_data", self.conn, + parse_dates={'IntDateOnlyCol': '%Y%m%d'}) + assert issubclass(df.IntDateOnlyCol.dtype.type, np.datetime64) + assert df.IntDateOnlyCol.tolist() == [ + pd.Timestamp('2010-10-10'), + pd.Timestamp('2010-12-12') + ] def test_date_and_index(self): # Test case where same column appears in parse_date and index_col diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 055a490bc6b5d..78b47960e1a04 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -96,6 +96,8 @@ def setup_method(self, method): self.dta24_111 = os.path.join(self.dirpath, 'stata7_111.dta') + self.stata_dates = os.path.join(self.dirpath, 'stata13_dates.dta') 
+ def read_dta(self, file): # Legacy default reader configuration return read_stata(file, convert_dates=True) @@ -1327,3 +1329,22 @@ def test_set_index(self): df.to_stata(path) reread = pd.read_stata(path, index_col='index') tm.assert_frame_equal(df, reread) + + @pytest.mark.parametrize( + 'column', ['ms', 'day', 'week', 'month', 'qtr', 'half', 'yr']) + def test_date_parsing_ignores_format_details(self, column): + # GH 17797 + # + # Test that display formats are ignored when determining if a numeric + # column is a date value. + # + # All date types are stored as numbers and format associated with the + # column denotes both the type of the date and the display format. + # + # STATA supports 9 date types which each have distinct units. We test 7 + # of the 9 types, ignoring %tC and %tb. %tC is a variant of %tc that + # accounts for leap seconds and %tb relies on STATAs business calendar. + df = read_stata(self.stata_dates) + unformatted = df.loc[0, column] + formatted = df.loc[0, column + "_fmt"] + assert unformatted == formatted diff --git a/pandas/tests/plotting/test_converter.py b/pandas/tests/plotting/test_converter.py index e1f64bed5598d..3818c04649366 100644 --- a/pandas/tests/plotting/test_converter.py +++ b/pandas/tests/plotting/test_converter.py @@ -1,20 +1,144 @@ +import subprocess import pytest from datetime import datetime, date import numpy as np -from pandas import Timestamp, Period, Index +from pandas import Timestamp, Period, Index, date_range, Series from pandas.compat import u +import pandas.core.config as cf import pandas.util.testing as tm from pandas.tseries.offsets import Second, Milli, Micro, Day from pandas.compat.numpy import np_datetime64_compat converter = pytest.importorskip('pandas.plotting._converter') +from pandas.plotting import (register_matplotlib_converters, + deregister_matplotlib_converters) def test_timtetonum_accepts_unicode(): assert (converter.time2num("00:01") == converter.time2num(u("00:01"))) +class TestRegistration(object): + + def test_register_by_default(self): + # Run in subprocess to ensure a clean state + code = ("'import matplotlib.units; " + "import pandas as pd; " + "units = dict(matplotlib.units.registry); " + "assert pd.Timestamp in units)'") + call = ['python', '-c', code] + assert subprocess.check_call(call) == 0 + + def test_warns(self): + plt = pytest.importorskip("matplotlib.pyplot") + s = Series(range(12), index=date_range('2017', periods=12)) + _, ax = plt.subplots() + + # Set to the "warning" state, in case this isn't the first test run + converter._WARN = True + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False) as w: + ax.plot(s.index, s.values) + plt.close() + + assert len(w) == 1 + assert "Using an implicitly registered datetime converter" in str(w[0]) + + def test_registering_no_warning(self): + plt = pytest.importorskip("matplotlib.pyplot") + s = Series(range(12), index=date_range('2017', periods=12)) + _, ax = plt.subplots() + + # Set to the "warn" state, in case this isn't the first test run + converter._WARN = True + register_matplotlib_converters() + with tm.assert_produces_warning(None) as w: + ax.plot(s.index, s.values) + + assert len(w) == 0 + + def test_pandas_plots_register(self): + pytest.importorskip("matplotlib.pyplot") + s = Series(range(12), index=date_range('2017', periods=12)) + # Set to the "warn" state, in case this isn't the first test run + converter._WARN = True + with tm.assert_produces_warning(None) as w: + s.plot() + + assert len(w) == 0 + + def 
test_matplotlib_formatters(self): + units = pytest.importorskip("matplotlib.units") + assert Timestamp in units.registry + + ctx = cf.option_context("plotting.matplotlib.register_converters", + False) + with ctx: + assert Timestamp not in units.registry + + assert Timestamp in units.registry + + def test_option_no_warning(self): + pytest.importorskip("matplotlib.pyplot") + ctx = cf.option_context("plotting.matplotlib.register_converters", + False) + plt = pytest.importorskip("matplotlib.pyplot") + s = Series(range(12), index=date_range('2017', periods=12)) + _, ax = plt.subplots() + + converter._WARN = True + # Test without registering first, no warning + with ctx: + with tm.assert_produces_warning(None) as w: + ax.plot(s.index, s.values) + + assert len(w) == 0 + + # Now test with registering + converter._WARN = True + register_matplotlib_converters() + with ctx: + with tm.assert_produces_warning(None) as w: + ax.plot(s.index, s.values) + + assert len(w) == 0 + + def test_registry_resets(self): + units = pytest.importorskip("matplotlib.units") + dates = pytest.importorskip("matplotlib.dates") + + # make a copy, to reset to + original = dict(units.registry) + + try: + # get to a known state + units.registry.clear() + date_converter = dates.DateConverter() + units.registry[datetime] = date_converter + units.registry[date] = date_converter + + register_matplotlib_converters() + assert units.registry[date] is not date_converter + deregister_matplotlib_converters() + assert units.registry[date] is date_converter + + finally: + # restore original stater + units.registry.clear() + for k, v in original.items(): + units.registry[k] = v + + def test_old_import_warns(self): + with tm.assert_produces_warning(FutureWarning) as w: + from pandas.tseries import converter + converter.register() + + assert len(w) + assert ('pandas.plotting.register_matplotlib_converters' in + str(w[0].message)) + + class TestDateTimeConverter(object): def setup_method(self, method): diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index d66012e2a56a0..d6cedac747f25 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -1,13 +1,14 @@ """ Test cases for time series specific (freq conversion, etc) """ from datetime import datetime, timedelta, date, time +import pickle import pytest from pandas.compat import lrange, zip import numpy as np from pandas import Index, Series, DataFrame, NaT -from pandas.compat import is_platform_mac +from pandas.compat import is_platform_mac, PY3 from pandas.core.indexes.datetimes import date_range, bdate_range from pandas.core.indexes.timedeltas import timedelta_range from pandas.tseries.offsets import DateOffset @@ -1470,5 +1471,12 @@ def _check_plot_works(f, freq=None, series=None, *args, **kwargs): with ensure_clean(return_filelike=True) as path: plt.savefig(path) + + # GH18439 + # this is supported only in Python 3 pickle since + # pickle in Python2 doesn't support instancemethod pickling + if PY3: + with ensure_clean(return_filelike=True) as path: + pickle.dump(fig, path) finally: plt.close(fig) diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index 6f476553091d9..54a512d14fef4 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -201,6 +201,7 @@ def test_parallel_coordinates(self): with tm.assert_produces_warning(FutureWarning): parallel_coordinates(df, 'Name', colors=colors) + 
@pytest.mark.xfail(reason="unreliable test") def test_parallel_coordinates_with_sorted_labels(self): """ For #15908 """ from pandas.plotting import parallel_coordinates diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 84a15cab34cd0..11368e44943d8 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -1594,7 +1594,9 @@ def test_concat_series_axis1_same_names_ignore_index(self): s2 = Series(randn(len(dates)), index=dates, name='value') result = concat([s1, s2], axis=1, ignore_index=True) - assert np.array_equal(result.columns, [0, 1]) + expected = Index([0, 1]) + + tm.assert_index_equal(result.columns, expected) def test_concat_iterables(self): from collections import deque, Iterable @@ -1981,3 +1983,21 @@ def test_concat_will_upcast(dt, pdt): pdt(np.array([5], dtype=dt, ndmin=dims))] x = pd.concat(dfs) assert x.values.dtype == 'float64' + + +def test_concat_empty_and_non_empty_frame_regression(): + # GH 18178 regression test + df1 = pd.DataFrame({'foo': [1]}) + df2 = pd.DataFrame({'foo': []}) + expected = pd.DataFrame({'foo': [1.0]}) + result = pd.concat([df1, df2]) + assert_frame_equal(result, expected) + + +def test_concat_empty_and_non_empty_series_regression(): + # GH 18187 regression test + s1 = pd.Series([1]) + s2 = pd.Series([]) + expected = s1 + result = pd.concat([s1, s2]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/reshape/test_merge.py b/pandas/tests/reshape/test_merge.py index 172667c9a0fb8..33d91af21c723 100644 --- a/pandas/tests/reshape/test_merge.py +++ b/pandas/tests/reshape/test_merge.py @@ -861,6 +861,12 @@ def test_validation(self): result = merge(left, right, on=['a', 'b'], validate='1:1') assert_frame_equal(result, expected_multi) + def test_merge_two_empty_df_no_division_error(self): + # GH17776, PR #17846 + a = pd.DataFrame({'a': [], 'b': [], 'c': []}) + with np.errstate(divide='raise'): + merge(a, a, on=('a', 'b')) + def _check_merge(x, y): for how in ['inner', 'left', 'outer']: diff --git a/pandas/tests/reshape/test_merge_asof.py b/pandas/tests/reshape/test_merge_asof.py index 78bfa2ff8597c..4b2680b9be592 100644 --- a/pandas/tests/reshape/test_merge_asof.py +++ b/pandas/tests/reshape/test_merge_asof.py @@ -973,3 +973,15 @@ def test_on_float_by_int(self): columns=['symbol', 'exch', 'price', 'mpv']) assert_frame_equal(result, expected) + + def test_merge_datatype_error(self): + """ Tests merge datatype mismatch error """ + msg = 'merge keys \[0\] object and int64, must be the same type' + + left = pd.DataFrame({'left_val': [1, 5, 10], + 'a': ['a', 'b', 'c']}) + right = pd.DataFrame({'right_val': [1, 2, 3, 6, 7], + 'a': [1, 2, 3, 6, 7]}) + + with tm.assert_raises_regex(MergeError, msg): + merge_asof(left, right, on='a') diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py index 135e4c544de41..0e69371511294 100644 --- a/pandas/tests/scalar/test_nat.py +++ b/pandas/tests/scalar/test_nat.py @@ -125,12 +125,13 @@ def test_round_nat(klass): def test_NaT_methods(): # GH 9513 + # GH 17329 for `timestamp` raise_methods = ['astimezone', 'combine', 'ctime', 'dst', 'fromordinal', 'fromtimestamp', 'isocalendar', 'strftime', 'strptime', 'time', 'timestamp', 'timetuple', 'timetz', 'toordinal', 'tzname', 'utcfromtimestamp', 'utcnow', 'utcoffset', - 'utctimetuple'] + 'utctimetuple', 'timestamp'] nat_methods = ['date', 'now', 'replace', 'to_datetime', 'today', 'tz_convert', 'tz_localize'] nan_methods = ['weekday', 'isoweekday'] diff --git 
a/pandas/tests/scalar/test_timestamp.py b/pandas/tests/scalar/test_timestamp.py
index c1b9f858a08de..4053257fbd2c8 100644
--- a/pandas/tests/scalar/test_timestamp.py
+++ b/pandas/tests/scalar/test_timestamp.py
@@ -19,7 +19,7 @@
 from pandas._libs import tslib, period
 from pandas._libs.tslibs.timezones import get_timezone
-from pandas.compat import lrange, long
+from pandas.compat import lrange, long, PY3
 from pandas.util.testing import assert_series_equal
 from pandas.compat.numpy import np_datetime64_compat
 from pandas import (Timestamp, date_range, Period, Timedelta, compat,
@@ -1079,6 +1079,28 @@ def test_is_leap_year(self):
             dt = Timestamp('2100-01-01 00:00:00', tz=tz)
             assert not dt.is_leap_year

+    def test_timestamp(self):
+        # GH#17329
+        # tz-naive --> treat it as if it were UTC for purposes of timestamp()
+        ts = Timestamp.now()
+        uts = ts.replace(tzinfo=utc)
+        assert ts.timestamp() == uts.timestamp()
+
+        tsc = Timestamp('2014-10-11 11:00:01.12345678', tz='US/Central')
+        utsc = tsc.tz_convert('UTC')
+
+        # utsc is a different representation of the same time
+        assert tsc.timestamp() == utsc.timestamp()
+
+        if PY3:
+
+            # datetime.timestamp() converts in the local timezone
+            with tm.set_timezone('UTC'):
+
+                # should agree with datetime.timestamp method
+                dt = ts.to_pydatetime()
+                assert dt.timestamp() == ts.timestamp()
+


 class TestTimestampNsOperations(object):
diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py
index 8cc40bb5146c5..2ee404ab5fe0d 100644
--- a/pandas/tests/series/test_analytics.py
+++ b/pandas/tests/series/test_analytics.py
@@ -848,6 +848,12 @@ def test_value_counts_nunique(self):
         result = series.nunique()
         assert result == 11

+        # GH 18051
+        s = pd.Series(pd.Categorical([]))
+        assert s.nunique() == 0
+        s = pd.Series(pd.Categorical([np.nan]))
+        assert s.nunique() == 0
+
     def test_unique(self):

         # 714 also, dtype=float
@@ -920,6 +926,14 @@ def test_drop_duplicates(self):
         sc.drop_duplicates(keep=False, inplace=True)
         assert_series_equal(sc, s[~expected])

+        # GH 18051
+        s = pd.Series(pd.Categorical([]))
+        tm.assert_categorical_equal(s.unique(), pd.Categorical([]),
+                                    check_dtype=False)
+        s = pd.Series(pd.Categorical([np.nan]))
+        tm.assert_categorical_equal(s.unique(), pd.Categorical([np.nan]),
+                                    check_dtype=False)
+
     def test_clip(self):
         val = self.ts.median()
diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py
index bd4e8b23f31b4..5ca4eba4da13b 100644
--- a/pandas/tests/series/test_missing.py
+++ b/pandas/tests/series/test_missing.py
@@ -636,17 +636,21 @@ def test_valid(self):
     def test_isna(self):
         ser = Series([0, 5.4, 3, nan, -0.001])
-        np.array_equal(ser.isna(),
-                       Series([False, False, False, True, False]).values)
+        expected = Series([False, False, False, True, False])
+        tm.assert_series_equal(ser.isna(), expected)
+
         ser = Series(["hi", "", nan])
-        np.array_equal(ser.isna(), Series([False, False, True]).values)
+        expected = Series([False, False, True])
+        tm.assert_series_equal(ser.isna(), expected)

     def test_notna(self):
         ser = Series([0, 5.4, 3, nan, -0.001])
-        np.array_equal(ser.notna(),
-                       Series([True, True, True, False, True]).values)
+        expected = Series([True, True, True, False, True])
+        tm.assert_series_equal(ser.notna(), expected)
+
         ser = Series(["hi", "", nan])
-        np.array_equal(ser.notna(), Series([True, True, False]).values)
+        expected = Series([True, True, False])
+        tm.assert_series_equal(ser.notna(), expected)

     def test_pad_nan(self):
         x = Series([np.nan, 1., np.nan, 3., np.nan], ['z', 'a', 'b', 'c',
                                                       'd'],
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index 38625bfb29917..240a7ad4b22f9 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -1132,19 +1132,19 @@ def test_pad_backfill_object_segfault():

     result = libalgos.pad_object(old, new)
     expected = np.array([-1], dtype=np.int64)
-    assert (np.array_equal(result, expected))
+    tm.assert_numpy_array_equal(result, expected)

     result = libalgos.pad_object(new, old)
     expected = np.array([], dtype=np.int64)
-    assert (np.array_equal(result, expected))
+    tm.assert_numpy_array_equal(result, expected)

     result = libalgos.backfill_object(old, new)
     expected = np.array([-1], dtype=np.int64)
-    assert (np.array_equal(result, expected))
+    tm.assert_numpy_array_equal(result, expected)

     result = libalgos.backfill_object(new, old)
     expected = np.array([], dtype=np.int64)
-    assert (np.array_equal(result, expected))
+    tm.assert_numpy_array_equal(result, expected)


 def test_arrmap():
@@ -1219,7 +1219,7 @@ def test_is_lexsorted():
                           1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
                           0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                           0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                          0, 0, 0, 0, 0, 0, 0, 0, 0]),
+                          0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='int64'),
               np.array([30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19,
                         18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7,
                         6, 5, 4, 3, 2, 1, 0, 30, 29, 28,
@@ -1231,19 +1231,10 @@ def test_is_lexsorted():
                         7, 6, 5, 4, 3, 2, 1, 0, 30, 29, 28, 27, 26, 25,
                         24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13,
                         12, 11, 10, 9, 8, 7, 6, 5,
-                        4, 3, 2, 1, 0])]
+                        4, 3, 2, 1, 0], dtype='int64')]

     assert (not libalgos.is_lexsorted(failure))

-# def test_get_group_index():
-# a = np.array([0, 1, 2, 0, 2, 1, 0, 0], dtype=np.int64)
-# b = np.array([1, 0, 3, 2, 0, 2, 3, 0], dtype=np.int64)
-# expected = np.array([1, 4, 11, 2, 8, 6, 3, 0], dtype=np.int64)
-
-# result = lib.get_group_index([a, b], (3, 4))
-
-# assert(np.array_equal(result, expected))
-

 def test_groupsort_indexer():
     a = np.random.randint(0, 1000, 100).astype(np.int64)
@@ -1252,14 +1243,22 @@ def test_groupsort_indexer():
     result = libalgos.groupsort_indexer(a, 1000)[0]

     # need to use a stable sort
+    # np.argsort returns int, groupsort_indexer
+    # always returns int64
     expected = np.argsort(a, kind='mergesort')
-    assert (np.array_equal(result, expected))
+    expected = expected.astype(np.int64)
+
+    tm.assert_numpy_array_equal(result, expected)

     # compare with lexsort
+    # np.lexsort returns int, groupsort_indexer
+    # always returns int64
     key = a * 1000 + b
     result = libalgos.groupsort_indexer(key, 1000000)[0]
     expected = np.lexsort((b, a))
-    assert (np.array_equal(result, expected))
+    expected = expected.astype(np.int64)
+
+    tm.assert_numpy_array_equal(result, expected)


 def test_infinity_sort():
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
index 272ba25bf8f8a..6366aae8ccdf6 100644
--- a/pandas/tests/test_categorical.py
+++ b/pandas/tests/test_categorical.py
@@ -2124,6 +2124,13 @@ def test_creation_astype(self):
         res = s.astype(CategoricalDtype(list('abcdef'), ordered=True))
         tm.assert_series_equal(res, exp)

+    @pytest.mark.parametrize('columns', [['x'], ['x', 'y'], ['x', 'y', 'z']])
+    def test_empty_astype(self, columns):
+        # GH 18004
+        msg = '> 1 ndim Categorical are not supported at this time'
+        with tm.assert_raises_regex(NotImplementedError, msg):
+            DataFrame(columns=columns).astype('category')
+
     def test_construction_series(self):

         l = [1, 2, 3, 1]
diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py
index 61f0c992225c6..b8e9191002640
--- a/pandas/tests/test_downstream.py
+++ b/pandas/tests/test_downstream.py
@@ -52,7 +52,6 @@ def test_xarray(df):
     assert df.to_xarray() is not None


-@tm.network
 def test_statsmodels():

     statsmodels = import_module('statsmodels')  # noqa
diff --git a/pandas/tests/test_join.py b/pandas/tests/test_join.py
index cde1cab37d09c..af946436b55c7 100644
--- a/pandas/tests/test_join.py
+++ b/pandas/tests/test_join.py
@@ -53,7 +53,7 @@ def test_left_join_indexer_unique():

     result = _join.left_join_indexer_unique_int64(b, a)
     expected = np.array([1, 1, 2, 3, 3], dtype=np.int64)
-    assert (np.array_equal(result, expected))
+    tm.assert_numpy_array_equal(result, expected)


 def test_left_outer_join_bug():
@@ -69,13 +69,14 @@ def test_left_outer_join_bug():

     lidx, ridx = _join.left_outer_join(left, right, max_groups, sort=False)

-    exp_lidx = np.arange(len(left))
-    exp_ridx = -np.ones(len(left))
+    exp_lidx = np.arange(len(left), dtype=np.int64)
+    exp_ridx = -np.ones(len(left), dtype=np.int64)
+
     exp_ridx[left == 1] = 1
     exp_ridx[left == 3] = 0

-    assert (np.array_equal(lidx, exp_lidx))
-    assert (np.array_equal(ridx, exp_ridx))
+    tm.assert_numpy_array_equal(lidx, exp_lidx)
+    tm.assert_numpy_array_equal(ridx, exp_ridx)


 def test_inner_join_indexer():
diff --git a/pandas/tests/test_lib.py b/pandas/tests/test_lib.py
index 2662720bb436d..75aa9aa4e8198 100644
--- a/pandas/tests/test_lib.py
+++ b/pandas/tests/test_lib.py
@@ -198,7 +198,7 @@ def test_get_reverse_indexer(self):
         indexer = np.array([-1, -1, 1, 2, 0, -1, 3, 4], dtype=np.int64)
         result = lib.get_reverse_indexer(indexer, 5)
         expected = np.array([4, 2, 3, 6, 7], dtype=np.int64)
-        assert np.array_equal(result, expected)
+        tm.assert_numpy_array_equal(result, expected)


 class TestNAObj(object):
diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py
index ac8297a53de37..e64bf2217e717 100644
--- a/pandas/tests/test_resample.py
+++ b/pandas/tests/test_resample.py
@@ -2729,6 +2729,34 @@ def test_resample_weekly_bug_1726(self):
         # it works!
         df.resample('W-MON', closed='left', label='left').first()

+    def test_resample_with_dst_time_change(self):
+        # GH 15549
+        index = pd.DatetimeIndex([1457537600000000000, 1458059600000000000],
+                                 tz='UTC').tz_convert('America/Chicago')
+        df = pd.DataFrame([1, 2], index=index)
+        result = df.resample('12h', closed='right',
+                             label='right').last().ffill()
+
+        expected_index_values = ['2016-03-09 12:00:00-06:00',
+                                 '2016-03-10 00:00:00-06:00',
+                                 '2016-03-10 12:00:00-06:00',
+                                 '2016-03-11 00:00:00-06:00',
+                                 '2016-03-11 12:00:00-06:00',
+                                 '2016-03-12 00:00:00-06:00',
+                                 '2016-03-12 12:00:00-06:00',
+                                 '2016-03-13 00:00:00-06:00',
+                                 '2016-03-13 13:00:00-05:00',
+                                 '2016-03-14 01:00:00-05:00',
+                                 '2016-03-14 13:00:00-05:00',
+                                 '2016-03-15 01:00:00-05:00',
+                                 '2016-03-15 13:00:00-05:00']
+        index = pd.DatetimeIndex(expected_index_values,
+                                 tz='UTC').tz_convert('America/Chicago')
+        expected = pd.DataFrame([1.0, 1.0, 1.0, 1.0, 1.0,
+                                 1.0, 1.0, 1.0, 1.0, 1.0,
+                                 1.0, 1.0, 2.0], index=index)
+        assert_frame_equal(result, expected)
+
     def test_resample_bms_2752(self):
         # GH2753
         foo = pd.Series(index=pd.bdate_range('20000101', '20000201'))
@@ -3103,6 +3131,26 @@ def f(x):
         result = g.apply(f)
         assert_frame_equal(result, expected)

+    def test_apply_with_mutated_index(self):
+        # GH 15169
+        index = pd.date_range('1-1-2015', '12-31-15', freq='D')
+        df = pd.DataFrame(data={'col1': np.random.rand(len(index))},
+                          index=index)
+
+        def f(x):
+            s = pd.Series([1, 2], index=['a', 'b'])
+            return s
+
+        expected = df.groupby(pd.Grouper(freq='M')).apply(f)
+
+        result = df.resample('M').apply(f)
+        assert_frame_equal(result, expected)
+
+        # A case for series
+        expected = df['col1'].groupby(pd.Grouper(freq='M')).apply(f)
+        result = df['col1'].resample('M').apply(f)
+        assert_series_equal(result, expected)
+
     def test_resample_groupby_with_label(self):
         # GH 13235
         index = date_range('2000-01-01', freq='2D', periods=5)
@@ -3380,3 +3428,11 @@ def test_aggregate_with_nat(self):

         # if NaT is included, 'var', 'std', 'mean', 'first','last'
         # and 'nth' doesn't work yet
+
+    def test_repr(self):
+        # GH18203
+        result = repr(TimeGrouper(key='A', freq='H'))
+        expected = ("TimeGrouper(key='A', freq=<Hour>, axis=0, sort=True, "
+                    "closed='left', label='left', how='mean', "
+                    "convention='e', base=0)")
+        assert result == expected
diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py
index a5b12bbf9608a..06c1fa1c0905a 100644
--- a/pandas/tests/test_sorting.py
+++ b/pandas/tests/test_sorting.py
@@ -332,16 +332,17 @@ def testit(label_list, shape):
            label_list2 = decons_group_index(group_index, shape)

            for a, b in zip(label_list, label_list2):
-                assert (np.array_equal(a, b))
+                tm.assert_numpy_array_equal(a, b)

        shape = (4, 5, 6)
-        label_list = [np.tile([0, 1, 2, 3, 0, 1, 2, 3], 100), np.tile(
-            [0, 2, 4, 3, 0, 1, 2, 3], 100), np.tile(
-            [5, 1, 0, 2, 3, 0, 5, 4], 100)]
+        label_list = [np.tile([0, 1, 2, 3, 0, 1, 2, 3], 100).astype(np.int64),
+                      np.tile([0, 2, 4, 3, 0, 1, 2, 3], 100).astype(np.int64),
+                      np.tile([5, 1, 0, 2, 3, 0, 5, 4], 100).astype(np.int64)]
        testit(label_list, shape)

        shape = (10000, 10000)
-        label_list = [np.tile(np.arange(10000), 5), np.tile(np.arange(10000), 5)]
+        label_list = [np.tile(np.arange(10000, dtype=np.int64), 5),
+                      np.tile(np.arange(10000, dtype=np.int64), 5)]
        testit(label_list, shape)
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
index f1b97081b6d93..8aa69bcbfdf7f 100644
--- a/pandas/tests/test_strings.py
+++ b/pandas/tests/test_strings.py
@@ -2086,6 +2086,18 @@ def test_rsplit_to_multiindex_expand(self):
         tm.assert_index_equal(result, exp)
         assert result.nlevels == 2

+    def test_split_nan_expand(self):
+        # gh-18450
+        s = Series(["foo,bar,baz", NA])
+        result = s.str.split(",", expand=True)
+        exp = DataFrame([["foo", "bar", "baz"], [NA, NA, NA]])
+        tm.assert_frame_equal(result, exp)
+
+        # check that these are actually np.nan and not None
+        # TODO see GH 18463
+        # tm.assert_frame_equal does not differentiate
+        assert all(np.isnan(x) for x in result.iloc[1])
+
     def test_split_with_name(self):
         # GH 12617
diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py
index c567613acebd1..35ae4ad4d5db4 100644
--- a/pandas/tests/test_window.py
+++ b/pandas/tests/test_window.py
@@ -2491,6 +2491,14 @@ def test_rolling_corr_pairwise(self):
         self._check_pairwise_moment('rolling', 'corr', window=10,
                                     min_periods=5)

+    @pytest.mark.parametrize('window', range(7))
+    def test_rolling_corr_with_zero_variance(self, window):
+        # GH 18430
+        s = pd.Series(np.zeros(20))
+        other = pd.Series(np.arange(20))
+
+        assert s.rolling(window=window).corr(other=other).isna().all()
+
     def _check_pairwise_moment(self, dispatch, name, **kwargs):
         def get_result(obj, obj2=None):
             return getattr(getattr(obj, dispatch)(**kwargs), name)(obj2)
@@ -2979,6 +2987,16 @@ def test_rolling_kurt_edge_cases(self):
         x = d.rolling(window=4).kurt()
         tm.assert_series_equal(expected, x)

+    def test_rolling_skew_eq_value_fperr(self):
+        # #18804 all rolling skew for all equal values should return Nan
+        a = pd.Series([1.1] * 15).rolling(window=10).skew()
+        assert np.isnan(a).all()
+
+    def test_rolling_kurt_eq_value_fperr(self):
+        # #18804 all rolling kurt for all equal values should return Nan
+        a = pd.Series([1.1] * 15).rolling(window=10).kurt()
+        assert np.isnan(a).all()
+
     def _check_expanding_ndarray(self, func, static_comp, has_min_periods=True,
                                  has_time_rule=True, preserve_nan=True):
         result = func(self.arr)
diff --git a/pandas/tests/tseries/test_timezones.py b/pandas/tests/tseries/test_timezones.py
index aa8fe90ea6500..823e22c4f87d1 100644
--- a/pandas/tests/tseries/test_timezones.py
+++ b/pandas/tests/tseries/test_timezones.py
@@ -13,7 +13,7 @@
 import pandas.util.testing as tm
 import pandas.tseries.offsets as offsets
-from pandas.compat import lrange, zip
+from pandas.compat import lrange, zip, PY3
 from pandas.core.indexes.datetimes import bdate_range, date_range
 from pandas.core.dtypes.dtypes import DatetimeTZDtype
 from pandas._libs import tslib
@@ -70,7 +70,7 @@ def test_utc_to_local_no_modify(self):
         rng_eastern = rng.tz_convert(self.tzstr('US/Eastern'))

         # Values are unmodified
-        assert np.array_equal(rng.asi8, rng_eastern.asi8)
+        tm.assert_numpy_array_equal(rng.asi8, rng_eastern.asi8)

         assert self.cmptz(rng_eastern.tz, self.tz('US/Eastern'))

@@ -108,7 +108,7 @@ def test_localize_utc_conversion_explicit(self):
         rng = date_range('3/10/2012', '3/11/2012', freq='30T')
         converted = rng.tz_localize(self.tz('US/Eastern'))
         expected_naive = rng + offsets.Hour(5)
-        assert np.array_equal(converted.asi8, expected_naive.asi8)
+        tm.assert_numpy_array_equal(converted.asi8, expected_naive.asi8)

         # DST ambiguity, this should fail
         rng = date_range('3/11/2012', '3/12/2012', freq='30T')
@@ -424,7 +424,7 @@ def test_with_tz(self):

         # datetimes with tzinfo set
         dr = bdate_range(datetime(2005, 1, 1, tzinfo=pytz.utc),
-                         '1/1/2009', tz=pytz.utc)
+                         datetime(2009, 1, 1, tzinfo=pytz.utc))
         pytest.raises(Exception, bdate_range,
                       datetime(2005, 1, 1, tzinfo=pytz.utc), '1/1/2009',
@@ -1278,16 +1278,22 @@ def test_replace_tzinfo(self):

         result_dt = dt.replace(tzinfo=tzinfo)
         result_pd = Timestamp(dt).replace(tzinfo=tzinfo)

-        if hasattr(result_dt, 'timestamp'):  # New method in Py 3.3
-            assert result_dt.timestamp() == result_pd.timestamp()
+        if PY3:
+            # datetime.timestamp() converts in the local timezone
+            with tm.set_timezone('UTC'):
+                assert result_dt.timestamp() == result_pd.timestamp()
+
         assert result_dt == result_pd
         assert result_dt == result_pd.to_pydatetime()

         result_dt = dt.replace(tzinfo=tzinfo).replace(tzinfo=None)
         result_pd = Timestamp(dt).replace(tzinfo=tzinfo).replace(tzinfo=None)

-        if hasattr(result_dt, 'timestamp'):  # New method in Py 3.3
-            assert result_dt.timestamp() == result_pd.timestamp()
+        if PY3:
+            # datetime.timestamp() converts in the local timezone
+            with tm.set_timezone('UTC'):
+                assert result_dt.timestamp() == result_pd.timestamp()
+
         assert result_dt == result_pd
         assert result_dt == result_pd.to_pydatetime()
diff --git a/pandas/tseries/converter.py b/pandas/tseries/converter.py
index df603c4d880d8..26d3f3cb85edc 100644
--- a/pandas/tseries/converter.py
+++ b/pandas/tseries/converter.py
@@ -1,6 +1,7 @@
 # flake8: noqa
+import warnings

-from pandas.plotting._converter import (register, time2num,
+from pandas.plotting._converter import (time2num,
                                         TimeConverter, TimeFormatter,
                                         PeriodConverter, get_datevalue,
                                         DatetimeConverter,
@@ -9,3 +10,11 @@
                                         MilliSecondLocator, get_finder,
                                         TimeSeries_DateLocator,
                                         TimeSeries_DateFormatter)
+
+
+def register():
+    from pandas.plotting._converter import register as register_
+    msg = ("'pandas.tseries.converter.register' has been moved and renamed to "
+           "'pandas.plotting.register_matplotlib_converters'. ")
+    warnings.warn(msg, FutureWarning, stacklevel=2)
+    register_()
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
index 730d2782e85d2..dec67bbea854f 100644
--- a/pandas/util/testing.py
+++ b/pandas/util/testing.py
@@ -1074,8 +1074,12 @@ def assert_categorical_equal(left, right, check_dtype=True,
 def raise_assert_detail(obj, message, left, right, diff=None):
     if isinstance(left, np.ndarray):
         left = pprint_thing(left)
+    elif is_categorical_dtype(left):
+        left = repr(left)
     if isinstance(right, np.ndarray):
         right = pprint_thing(right)
+    elif is_categorical_dtype(right):
+        right = repr(right)

     msg = """{obj} are different
diff --git a/scripts/convert_deps.py b/scripts/convert_deps.py
new file mode 100644
index 0000000000000..aabeb24a0c3c8
--- /dev/null
+++ b/scripts/convert_deps.py
@@ -0,0 +1,29 @@
+"""
+Convert the conda environment.yaml to a pip requirements.txt
+"""
+import yaml
+
+exclude = {'python=3'}
+rename = {'pytables': 'tables'}
+
+with open("ci/environment-dev.yaml") as f:
+    dev = yaml.load(f)
+
+with open("ci/requirements-optional-conda.txt") as f:
+    optional = [x.strip() for x in f.readlines()]
+
+required = dev['dependencies']
+required = [rename.get(dep, dep) for dep in required if dep not in exclude]
+optional = [rename.get(dep, dep) for dep in optional if dep not in exclude]
+
+
+with open("ci/requirements_dev.txt", 'wt') as f:
+    f.write("# This file was autogenerated by scripts/convert_deps.py\n")
+    f.write("# Do not modify directly\n")
+    f.write('\n'.join(required))
+
+
+with open("ci/requirements-optional-pip.txt", 'wt') as f:
+    f.write("# This file was autogenerated by scripts/convert_deps.py\n")
+    f.write("# Do not modify directly\n")
+    f.write("\n".join(optional))