diff --git a/.travis.yml b/.travis.yml index 8b6700e11d2c5..5dc4256a268ad 100644 --- a/.travis.yml +++ b/.travis.yml @@ -74,7 +74,11 @@ matrix: # In allow_failures - os: linux env: - - JOB="3.5_DOC" DOC=true + - JOB="3.6_DOC" DOC=true + addons: + apt: + packages: + - xsel allow_failures: - os: linux env: @@ -87,7 +91,7 @@ matrix: - JOB="3.6_NUMPY_DEV" TEST_ARGS="--skip-slow --skip-network" PANDAS_TESTING_MODE="deprecate" - os: linux env: - - JOB="3.5_DOC" DOC=true + - JOB="3.6_DOC" DOC=true before_install: - echo "before_install" diff --git a/AUTHORS.md b/AUTHORS.md new file mode 100644 index 0000000000000..dcaaea101f4c8 --- /dev/null +++ b/AUTHORS.md @@ -0,0 +1,57 @@ +About the Copyright Holders +=========================== + +* Copyright (c) 2008-2011 AQR Capital Management, LLC + + AQR Capital Management began pandas development in 2008. Development was + led by Wes McKinney. AQR released the source under this license in 2009. +* Copyright (c) 2011-2012, Lambda Foundry, Inc. + + Wes is now an employee of Lambda Foundry, and remains the pandas project + lead. +* Copyright (c) 2011-2012, PyData Development Team + + The PyData Development Team is the collection of developers of the PyData + project. This includes all of the PyData sub-projects, including pandas. The + core team that coordinates development on GitHub can be found here: + http://github.com/pydata. + +Full credits for pandas contributors can be found in the documentation. + +Our Copyright Policy +==================== + +PyData uses a shared copyright model. Each contributor maintains copyright +over their contributions to PyData. However, it is important to note that +these contributions are typically only changes to the repositories. Thus, +the PyData source code, in its entirety, is not the copyright of any single +person or institution. Instead, it is the collective copyright of the +entire PyData Development Team. If individual contributors want to maintain +a record of what changes/contributions they have specific copyright on, +they should indicate their copyright in the commit message of the change +when they commit the change to one of the PyData repositories. + +With this in mind, the following banner should be used in any source code +file to indicate the copyright and license terms: + +``` +#----------------------------------------------------------------------------- +# Copyright (c) 2012, PyData Development Team +# All rights reserved. +# +# Distributed under the terms of the BSD Simplified License. +# +# The full license is in the LICENSE file, distributed with this software. +#----------------------------------------------------------------------------- +``` + +Other licenses can be found in the LICENSES directory. + +License +======= + +pandas is distributed under a 3-clause ("Simplified" or "New") BSD +license. Parts of NumPy, SciPy, numpydoc, bottleneck, which all have +BSD-compatible licenses, are included. Their licenses follow the pandas +license. + diff --git a/LICENSE b/LICENSE index c9b8834e8774b..924de26253bf4 100644 --- a/LICENSE +++ b/LICENSE @@ -1,87 +1,29 @@ -======= -License -======= +BSD 3-Clause License -pandas is distributed under a 3-clause ("Simplified" or "New") BSD -license. Parts of NumPy, SciPy, numpydoc, bottleneck, which all have -BSD-compatible licenses, are included. Their licenses follow the pandas -license. - -pandas license -============== - -Copyright (c) 2011-2012, Lambda Foundry, Inc. and PyData Development Team -All rights reserved. 
- -Copyright (c) 2008-2011 AQR Capital Management, LLC +Copyright (c) 2008-2012, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team All rights reserved. Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials provided - with the distribution. - - * Neither the name of the copyright holder nor the names of any - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -About the Copyright Holders -=========================== - -AQR Capital Management began pandas development in 2008. Development was -led by Wes McKinney. AQR released the source under this license in 2009. -Wes is now an employee of Lambda Foundry, and remains the pandas project -lead. - -The PyData Development Team is the collection of developers of the PyData -project. This includes all of the PyData sub-projects, including pandas. The -core team that coordinates development on GitHub can be found here: -http://github.com/pydata. 
- -Full credits for pandas contributors can be found in the documentation. - -Our Copyright Policy -==================== - -PyData uses a shared copyright model. Each contributor maintains copyright -over their contributions to PyData. However, it is important to note that -these contributions are typically only changes to the repositories. Thus, -the PyData source code, in its entirety, is not the copyright of any single -person or institution. Instead, it is the collective copyright of the -entire PyData Development Team. If individual contributors want to maintain -a record of what changes/contributions they have specific copyright on, -they should indicate their copyright in the commit message of the change -when they commit the change to one of the PyData repositories. - -With this in mind, the following banner should be used in any source code -file to indicate the copyright and license terms: - -#----------------------------------------------------------------------------- -# Copyright (c) 2012, PyData Development Team -# All rights reserved. -# -# Distributed under the terms of the BSD Simplified License. -# -# The full license is in the LICENSE file, distributed with this software. -#----------------------------------------------------------------------------- - -Other licenses can be found in the LICENSES directory. \ No newline at end of file diff --git a/asv_bench/benchmarks/hdfstore_bench.py b/asv_bench/benchmarks/hdfstore_bench.py index dc72f3d548aaf..9da7aea2e7b5e 100644 --- a/asv_bench/benchmarks/hdfstore_bench.py +++ b/asv_bench/benchmarks/hdfstore_bench.py @@ -90,6 +90,14 @@ def time_query_store_table(self): stop = self.df2.index[15000] self.store.select('table', where="index > start and index < stop") + def time_store_tostring(self): + repr(self.store) + str(self.store) + + def time_store_info(self): + self.store.info() + + class HDF5Panel(object): goal_time = 0.2 diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 6a2c9d48c4a28..d941ef20dc7ac 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -204,6 +204,12 @@ def setup(self): [np.arange(100), list('A'), list('A')], names=['one', 'two', 'three']) + rng = np.random.RandomState(4) + size = 1 << 16 + self.mi_unused_levels = pd.MultiIndex.from_arrays([ + rng.randint(0, 1 << 13, size), + rng.randint(0, 1 << 10, size)])[rng.rand(size) < 0.1] + def time_series_xs_mi_ix(self): self.s.ix[999] @@ -248,6 +254,9 @@ def time_multiindex_small_get_loc_warm(self): def time_is_monotonic(self): self.miint.is_monotonic + def time_remove_unused_levels(self): + self.mi_unused_levels.remove_unused_levels() + class IntervalIndexing(object): goal_time = 0.2 diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 26917b8f9b792..a038304fe0f7a 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -59,6 +59,15 @@ if [ "$DOC" ]; then git remote -v git push origin gh-pages -f + + echo "Running doctests" + cd "$TRAVIS_BUILD_DIR" + pytest --doctest-modules \ + pandas/core/reshape/concat.py \ + pandas/core/reshape/pivot.py \ + pandas/core/reshape/reshape.py \ + pandas/core/reshape/tile.py + fi exit 0 diff --git a/ci/requirements-2.7.build b/ci/requirements-2.7.build index 415df13179fcf..a7b950e615464 100644 --- a/ci/requirements-2.7.build +++ b/ci/requirements-2.7.build @@ -2,5 +2,5 @@ python=2.7* python-dateutil=2.4.1 pytz=2013b nomkl -numpy +numpy=1.12* cython=0.23 diff --git a/ci/requirements-3.6.build b/ci/requirements-3.6.build index 1c4b46aea3865..8d09e0ee93070 100644 --- 
a/ci/requirements-3.6.build
+++ b/ci/requirements-3.6.build
@@ -2,5 +2,5 @@
python=3.6*
python-dateutil
pytz
nomkl
-numpy
+numpy=1.12*
cython
diff --git a/ci/requirements-3.5_DOC.build b/ci/requirements-3.6_DOC.build
similarity index 53%
rename from ci/requirements-3.5_DOC.build
rename to ci/requirements-3.6_DOC.build
index 73aeb3192242f..37faaa7e4db88 100644
--- a/ci/requirements-3.5_DOC.build
+++ b/ci/requirements-3.6_DOC.build
@@ -1,5 +1,5 @@
-python=3.5*
+python=3.6*
python-dateutil
pytz
-numpy
+numpy=1.12*
cython
diff --git a/ci/requirements-3.5_DOC.run b/ci/requirements-3.6_DOC.run
similarity index 87%
rename from ci/requirements-3.5_DOC.run
rename to ci/requirements-3.6_DOC.run
index 9647ab53ab835..df8087f62ef16 100644
--- a/ci/requirements-3.5_DOC.run
+++ b/ci/requirements-3.6_DOC.run
@@ -12,7 +12,7 @@
lxml
beautifulsoup4
html5lib
pytables
-openpyxl=1.8.5
+openpyxl
xlrd
xlwt
xlsxwriter
@@ -21,4 +21,4 @@
numexpr
bottleneck
statsmodels
xarray
-pyqt=4.11.4
+pyqt
diff --git a/ci/requirements-3.5_DOC.sh b/ci/requirements-3.6_DOC.sh
similarity index 100%
rename from ci/requirements-3.5_DOC.sh
rename to ci/requirements-3.6_DOC.sh
diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst
index ea00588ba156f..711c3e9a95d05 100644
--- a/doc/source/advanced.rst
+++ b/doc/source/advanced.rst
@@ -948,6 +948,16 @@ On the other hand, if the index is not monotonic, then both slice bounds must be
 In [11]: df.loc[2:3, :]
 KeyError: 'Cannot get right slice bound for non-unique label: 3'
+:meth:`Index.is_monotonic_increasing` and :meth:`Index.is_monotonic_decreasing` only check that
+an index is weakly monotonic. To check for strict monotonicity, you can combine one of those with
+:meth:`Index.is_unique`:
+
+.. ipython:: python
+
+   weakly_monotonic = pd.Index(['a', 'b', 'c', 'c'])
+   weakly_monotonic
+   weakly_monotonic.is_monotonic_increasing
+   weakly_monotonic.is_monotonic_increasing & weakly_monotonic.is_unique
 Endpoints are inclusive
 ~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/doc/source/api.rst b/doc/source/api.rst
index 888bb6d67e94b..d6053791d6f4b 100644
--- a/doc/source/api.rst
+++ b/doc/source/api.rst
@@ -99,6 +99,7 @@ HDFStore: PyTables (HDF5)
 HDFStore.append
 HDFStore.get
 HDFStore.select
+ HDFStore.info
 Feather
 ~~~~~~~
@@ -1705,6 +1706,7 @@ Computations / Descriptive Stats
 GroupBy.mean
 GroupBy.median
 GroupBy.min
+ GroupBy.ngroup
 GroupBy.nth
 GroupBy.ohlc
 GroupBy.prod
diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst
index cf4f1059ae17a..865f1ccae2c04 100644
--- a/doc/source/groupby.rst
+++ b/doc/source/groupby.rst
@@ -1122,12 +1122,36 @@ To see the order in which each row appears within its group, use the
 .. ipython:: python
-   df = pd.DataFrame(list('aaabba'), columns=['A'])
-   df
+   dfg = pd.DataFrame(list('aaabba'), columns=['A'])
+   dfg
+
+   dfg.groupby('A').cumcount()
+
+   dfg.groupby('A').cumcount(ascending=False)
+
+.. _groupby.ngroup:
+
+Enumerate groups
+~~~~~~~~~~~~~~~~
+
+.. versionadded:: 0.20.2
+
+To see the ordering of the groups (as opposed to the order of rows
+within a group given by ``cumcount``) you can use the ``ngroup``
+method.
+
+Note that the numbers given to the groups match the order in which the
+groups would be seen when iterating over the groupby object, not the
+order they are first observed.
+
+.. 
ipython:: python
-   df.groupby('A').cumcount()
+   dfg = pd.DataFrame(list('aaabba'), columns=['A'])
+   dfg
-   df.groupby('A').cumcount(ascending=False)  # kwarg only
+   dfg.groupby('A').ngroup()
+
+   dfg.groupby('A').ngroup(ascending=False)
 Plotting
 ~~~~~~~~
@@ -1176,14 +1200,41 @@ Regroup columns of a DataFrame according to their sum, and sum the aggregated on
 df
 df.groupby(df.sum(), axis=1).sum()
+.. _groupby.multicolumn_factorization:
+
+Multi-column factorization
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+By using ``.ngroup()``, we can extract information about the groups in
+a way similar to :func:`factorize` (as described further in the
+:ref:`reshaping API <reshaping.factorize>`) but which applies
+naturally to multiple columns of mixed type and different
+sources. This can be useful as an intermediate categorical-like step
+in processing, when the relationships between the group rows are more
+important than their content, or as input to an algorithm which only
+accepts the integer encoding. (For more information about support in
+pandas for full categorical data, see the :ref:`Categorical
+introduction <categorical>` and the
+:ref:`API documentation <api.categorical>`.)
+
+.. ipython:: python
+
+   dfg = pd.DataFrame({"A": [1, 1, 2, 3, 2], "B": list("aaaba")})
+
+   dfg
+
+   dfg.groupby(["A", "B"]).ngroup()
+
+   dfg.groupby(["A", [0, 0, 0, 1, 1]]).ngroup()
+
 Groupby by Indexer to 'resample' data
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Resampling produces new hypothetical samples(resamples) from already existing observed data or from a model that generates data. These new samples are similar to the pre-existing samples.
+Resampling produces new hypothetical samples (resamples) from already existing observed data or from a model that generates data. These new samples are similar to the pre-existing samples.
 In order to resample to work on indices that are non-datetimelike , the following procedure can be utilized.
-In the following examples, **df.index // 5** returns a binary array which is used to determine what get's selected for the groupby operation.
+In the following examples, **df.index // 5** returns a binary array which is used to determine what gets selected for the groupby operation.
 .. note:: The below example shows how we can downsample by consolidation of samples into fewer samples. Here by using **df.index // 5**, we are aggregating the samples in bins. By applying **std()** function, we aggregate the information contained in many samples into a small subset of values which is their standard deviation thereby reducing the number of samples.
diff --git a/doc/source/io.rst b/doc/source/io.rst
index bca23dd18a0e3..bd81b478b5326 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -226,8 +226,8 @@ NA and Missing Data Handling
 na_values : scalar, str, list-like, or dict, default ``None``
 Additional strings to recognize as NA/NaN. If dict passed, specific per-column NA values. By default the following values are interpreted as NaN:
-  ``'-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', 'NA',
-  '#NA', 'NULL', 'NaN', '-NaN', 'nan', '-nan', ''``.
+  ``'-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', 'n/a', 'NA',
+  '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', ''``.
 keep_default_na : boolean, default ``True``
 If na_values are specified and keep_default_na is ``False`` the default NaN values are overridden, otherwise they're appended to.
@@ -2739,11 +2739,6 @@ should be passed to ``index_col`` and ``header``
 import os
 os.remove('path_to_file.xlsx')
-.. 
warning::
-
-  Excel files saved in version 0.16.2 or prior that had index names will still able to be read in,
-  but the ``has_index_names`` argument must specified to ``True``.
-
 Parsing Specific Columns
 ++++++++++++++++++++++++
diff --git a/doc/source/release.rst b/doc/source/release.rst
index 2587962299569..bf272e243e0dd 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -37,6 +37,56 @@ analysis / manipulation tool available in any language.
 * Binary installers on PyPI: http://pypi.python.org/pypi/pandas
 * Documentation: http://pandas.pydata.org
+pandas 0.20.2
+-------------
+
+**Release date:** June 4, 2017
+
+This is a minor bug-fix release in the 0.20.x series and includes some small regression fixes,
+bug fixes and performance improvements.
+We recommend that all users upgrade to this version.
+
+See the :ref:`v0.20.2 Whatsnew <whatsnew_0202>` overview for an extensive list
+of all enhancements and bugs that have been fixed in 0.20.2.
+
+Thanks
+~~~~~~
+
+- Aaron Barber
+- Andrew 亮
+- Becky Sweger
+- Christian Prinoth
+- Christian Stade-Schuldt
+- DSM
+- Erik Fredriksen
+- Hugues Valois
+- Jeff Reback
+- Jeff Tratner
+- JimStearns206
+- John W. O'Brien
+- Joris Van den Bossche
+- JosephWagner
+- Keith Webber
+- Mehmet Ali "Mali" Akmanalp
+- Pankaj Pandey
+- Patrick Luo
+- Patrick O'Melveny
+- Pietro Battiston
+- RobinFiveWords
+- Ryan Hendrickson
+- SimonBaron
+- Tom Augspurger
+- WBare
+- bpraggastis
+- chernrick
+- chris-b1
+- economy
+- gfyoung
+- jaredsnyder
+- keitakurita
+- linebp
+- lloydkirk
 pandas 0.20.0 / 0.20.1
 ----------------------
diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst
index b93749922c8ea..5f125e329f6f1 100644
--- a/doc/source/reshaping.rst
+++ b/doc/source/reshaping.rst
@@ -636,7 +636,7 @@ When a column contains only one level, it will be omitted in the result.
 pd.get_dummies(df, drop_first=True)
-
+.. _reshaping.factorize:
 Factorizing values
 ------------------
diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst
index 71d85f9b3995b..1dd80aec4fd6c 100644
--- a/doc/source/timeseries.rst
+++ b/doc/source/timeseries.rst
@@ -1922,7 +1922,7 @@ then you can use a ``PeriodIndex`` and/or ``Series`` of ``Periods`` to do comput
 span = pd.period_range('1215-01-01', '1381-01-01', freq='D')
 span
-To convert from a ``int64`` based YYYYMMDD representation.
+To convert from an ``int64`` based YYYYMMDD representation.
 .. ipython:: python
diff --git a/doc/source/whatsnew.rst b/doc/source/whatsnew.rst
index b1f9990a3e6af..3385bafc26467 100644
--- a/doc/source/whatsnew.rst
+++ b/doc/source/whatsnew.rst
@@ -20,6 +20,8 @@ These are new features and improvements of note in each release.
 .. include:: whatsnew/v0.21.0.txt
+.. include:: whatsnew/v0.20.3.txt
+
 .. include:: whatsnew/v0.20.2.txt
 .. include:: whatsnew/v0.20.0.txt
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index a0bf2f9b3758a..9d475390175b2 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -515,7 +515,6 @@ Other Enhancements
 - Options added to allow one to turn on/off using ``bottleneck`` and ``numexpr``, see :ref:`here <basics.accelerate>` (:issue:`16157`)
 - ``DataFrame.style.bar()`` now accepts two more options to further customize the bar chart. Bar alignment is set with ``align='left'|'mid'|'zero'``, the default is "left", which is backward compatible; You can now pass a list of ``color=[color_negative, color_positive]``. (:issue:`14757`)
-
 .. 
_ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations
diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt
index 13365401f1d1c..31125db0f34d4 100644
--- a/doc/source/whatsnew/v0.20.2.txt
+++ b/doc/source/whatsnew/v0.20.2.txt
@@ -1,14 +1,12 @@
 .. _whatsnew_0202:
-v0.20.2 (???)
--------------
+v0.20.2 (June 4, 2017)
+----------------------
 This is a minor bug-fix release in the 0.20.x series and includes some small regression fixes,
 bug fixes and performance improvements.
 We recommend that all users upgrade to this version.
-Highlights include:
-
 .. contents:: What's new in v0.20.2
 :local:
 :backlinks: none
@@ -22,6 +20,11 @@ Enhancements
 - Unblocked access to additional compression types supported in pytables: 'blosc:blosclz, 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy', 'blosc:zlib', 'blosc:zstd' (:issue:`14478`)
 - ``Series`` provides a ``to_latex`` method (:issue:`16180`)
+- A new groupby method :meth:`~pandas.core.groupby.GroupBy.ngroup`,
+  parallel to the existing :meth:`~pandas.core.groupby.GroupBy.cumcount`,
+  has been added to return the group order (:issue:`11642`); see
+  :ref:`here <groupby.ngroup>`.
+
 .. _whatsnew_0202.performance:
 Performance Improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
@@ -31,40 +34,50 @@ Performance Improvements
 - Performance regression fix for MultiIndexes (:issue:`16319`, :issue:`16346`)
 - Improved performance of ``.clip()`` with scalar arguments (:issue:`15400`)
 - Improved performance of groupby with categorical groupers (:issue:`16413`)
+- Improved performance of ``MultiIndex.remove_unused_levels()`` (:issue:`16556`)
 .. _whatsnew_0202.bug_fixes:
 Bug Fixes
 ~~~~~~~~~
+- Silenced a warning on some Windows environments about "tput: terminal attributes: No such device or address" when
+  detecting the terminal size. This fix only applies to Python 3 (:issue:`16496`)
 - Bug in using ``pathlib.Path`` or ``py.path.local`` objects with io functions (:issue:`16291`)
+- Bug in ``Index.symmetric_difference()`` on two equal MultiIndexes, resulting in a ``TypeError`` (:issue:`13490`)
 - Bug in ``DataFrame.update()`` with ``overwrite=False`` and ``NaN`` values (:issue:`15593`)
-
-
-
-- Fixed a compatibility issue with IPython 6.0's tab completion showing deprecation warnings on Categoricals (:issue:`16409`)
-
+- Passing an invalid engine to :func:`read_csv` now raises an informative
+  ``ValueError`` rather than ``UnboundLocalError``. (:issue:`16511`)
+- Bug in :func:`unique` on an array of tuples (:issue:`16519`)
+- Bug in :func:`cut` when ``labels`` are set, resulting in incorrect label ordering (:issue:`16459`)
+- Fixed a compatibility issue with IPython 6.0's tab completion showing deprecation warnings on ``Categoricals`` (:issue:`16409`)
 Conversion
 ^^^^^^^^^^
-- Bug in ``pd.to_numeric()`` in which empty data inputs were causing Python to crash (:issue:`16302`)
-- Silence numpy warnings when broadcasting DataFrame to Series with comparison ops (:issue:`16378`, :issue:`16306`)
+- Bug in :func:`to_numeric` in which empty data inputs were causing a segfault of the interpreter (:issue:`16302`)
+- Silence numpy warnings when broadcasting ``DataFrame`` to ``Series`` with comparison ops (:issue:`16378`, :issue:`16306`)
 Indexing
 ^^^^^^^^
 - Bug in ``DataFrame.reset_index(level=)`` with single level index (:issue:`16263`)
-
+- Bug in partial string indexing with a monotonic, but not strictly-monotonic, index incorrectly reversing the slice bounds (:issue:`16515`)
+- Bug in ``MultiIndex.remove_unused_levels()`` that would not return a ``MultiIndex`` equal to the original. 
(:issue:`16556`) I/O ^^^ -- Bug in pd.read_csv() when comment is passed in space deliminted text files (:issue:`16472`) +- Bug in :func:`read_csv` when ``comment`` is passed in a space delimited text file (:issue:`16472`) +- Bug in :func:`read_csv` not raising an exception with nonexistent columns in ``usecols`` when it had the correct length (:issue:`14671`) - Bug that would force importing of the clipboard routines unnecessarily, potentially causing an import error on startup (:issue:`16288`) -- Bug that raised IndexError HTML-rendering an empty DataFrame (:issue:`15953`) +- Bug that raised ``IndexError`` when HTML-rendering an empty ``DataFrame`` (:issue:`15953`) +- Bug in :func:`read_csv` in which tarfile object inputs were raising an error in Python 2.x for the C engine (:issue:`16530`) +- Bug where ``DataFrame.to_html()`` ignored the ``index_names`` parameter (:issue:`16493`) +- Bug where ``pd.read_hdf()`` returns numpy strings for index names (:issue:`13492`) +- Bug in ``HDFStore.select_as_multiple()`` where start/stop arguments were not respected (:issue:`16209`) Plotting ^^^^^^^^ @@ -79,26 +92,29 @@ Plotting Groupby/Resample/Rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- Bug creating datetime rolling window on an empty DataFrame (:issue:`15819`) +- Bug in creating a time-based rolling window on an empty ``DataFrame`` (:issue:`15819`) +- Bug in ``rolling.cov()`` with offset window (:issue:`16058`) +- Bug in ``.resample()`` and ``.groupby()`` when aggregating on integers (:issue:`16361`) Sparse ^^^^^^ -- Bug in construction of SparseDataFrame from ``scipy.sparse.dok_matrix`` (:issue:`16179`) +- Bug in construction of ``SparseDataFrame`` from ``scipy.sparse.dok_matrix`` (:issue:`16179`) Reshaping ^^^^^^^^^ -- Bug in ``DataFrame.stack`` with unsorted levels in MultiIndex columns (:issue:`16323`) +- Bug in ``DataFrame.stack`` with unsorted levels in ``MultiIndex`` columns (:issue:`16323`) - Bug in ``pd.wide_to_long()`` where no error was raised when ``i`` was not a unique identifier (:issue:`16382`) - Bug in ``Series.isin(..)`` with a list of tuples (:issue:`16394`) - Bug in construction of a ``DataFrame`` with mixed dtypes including an all-NaT column. (:issue:`16395`) +- Bug in ``DataFrame.agg()`` and ``Series.agg()`` with aggregating on non-callable attributes (:issue:`16405`) Numeric ^^^^^^^ -- Bug in .interpolate(), where limit_direction was not respected when limit=None (default) was passed (:issue:16282) +- Bug in ``.interpolate()``, where ``limit_direction`` was not respected when ``limit=None`` (default) was passed (:issue:`16282`) Categorical ^^^^^^^^^^^ @@ -108,4 +124,4 @@ Categorical Other ^^^^^ -- Bug in ``pd.drop([])`` for DataFrame with non-unique indices (:issue:`16270`) +- Bug in ``DataFrame.drop()`` with an empty-list with non-unique indices (:issue:`16270`) diff --git a/doc/source/whatsnew/v0.20.3.txt b/doc/source/whatsnew/v0.20.3.txt new file mode 100644 index 0000000000000..52f7701724f18 --- /dev/null +++ b/doc/source/whatsnew/v0.20.3.txt @@ -0,0 +1,90 @@ +.. _whatsnew_0203: + +v0.20.3 (June ??, 2017) +----------------------- + +This is a minor bug-fix release in the 0.20.x series and includes some small regression fixes, +bug fixes and performance improvements. +We recommend that all users upgrade to this version. + +.. contents:: What's new in v0.20.3 + :local: + :backlinks: none + + +.. _whatsnew_0203.enhancements: + +Enhancements +~~~~~~~~~~~~ + + + + + + +.. _whatsnew_0203.performance: + +Performance Improvements +~~~~~~~~~~~~~~~~~~~~~~~~ + + + + + + +.. 
_whatsnew_0203.bug_fixes: + +Bug Fixes +~~~~~~~~~ + + + + +Conversion +^^^^^^^^^^ + +- Bug in pickle compat prior to the v0.20.x series, when ``UTC`` is a timezone in a Series/DataFrame/Index (:issue:`16608`) +- Bug in Series construction when passing a Series with ``dtype='category'`` (:issue:`16524`). + +Indexing +^^^^^^^^ + +- Bug in ``Float64Index`` causing an empty array instead of None to be returned from ``.get(np.nan)`` on a Series whose index did not contain any NaNs (:issue:`8569`) + +I/O +^^^ + + + +Plotting +^^^^^^^^ + + + + +Groupby/Resample/Rolling +^^^^^^^^^^^^^^^^^^^^^^^^ + + + +Sparse +^^^^^^ + + + + +Reshaping +^^^^^^^^^ + + + +Numeric +^^^^^^^ + + +Categorical +^^^^^^^^^^^ + + +Other +^^^^^ diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index b4ca3f011a81d..fe3eb291d06ff 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -37,6 +37,7 @@ Other Enhancements - :func:`api.types.infer_dtype` now infers decimals. (:issue: `15690`) - :func:`read_feather` has gained the ``nthreads`` parameter for multi-threaded operations (:issue:`16359`) - :func:`DataFrame.clip()` and :func: `Series.cip()` have gained an inplace argument. (:issue: `15388`) +- :func:`crosstab` has gained a ``margins_name`` parameter to define the name of the row / column that will contain the totals when margins=True. (:issue:`15972`) .. _whatsnew_0210.api_breaking: @@ -48,6 +49,10 @@ Backwards incompatible API changes - Accessing a non-existent attribute on a closed :class:`HDFStore` will now raise an ``AttributeError`` rather than a ``ClosedFileError`` (:issue:`16301`) +- :func:`read_csv` now treats ``'null'`` strings as missing values by default (:issue:`16471`) +- :func:`read_csv` now treats ``'n/a'`` strings as missing values by default (:issue:`16078`) + +- :class:`pandas.HDFStore`'s string representation is now faster and less detailed. For the previous behavior, use ``pandas.HDFStore.info()``. (:issue:`16503`). .. _whatsnew_0210.api: @@ -69,6 +74,7 @@ Deprecations Removal of prior version deprecations/changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +- ``pd.read_excel()`` has dropped the ``has_index_names`` parameter (:issue:`10967`) .. _whatsnew_0210.performance: @@ -77,7 +83,6 @@ Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - .. 
_whatsnew_0210.bug_fixes: Bug Fixes diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 21680fb0b3921..5e92c506b5d0c 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -152,7 +152,7 @@ cdef class IndexEngine: try: return self.mapping.get_item(val) - except TypeError: + except (TypeError, ValueError): raise KeyError(val) cdef inline _get_loc_duplicates(self, object val): @@ -470,7 +470,7 @@ cdef class DatetimeEngine(Int64Engine): try: val = _to_i8(val) return self.mapping.get_item(val) - except TypeError: + except (TypeError, ValueError): self._date_check_type(val) raise KeyError(val) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 2def4dc9dcf24..2549c8545908d 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -277,7 +277,7 @@ DEFAULT_CHUNKSIZE = 256 * 1024 # no longer excluding inf representations # '1.#INF','-1.#INF', '1.#INF000000', _NA_VALUES = [b'-1.#IND', b'1.#QNAN', b'1.#IND', b'-1.#QNAN', - b'#N/A N/A', b'NA', b'#NA', b'NULL', b'NaN', + b'#N/A N/A', b'n/a', b'NA', b'#NA', b'NULL', b'null', b'NaN', b'nan', b''] diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index 4a9a2647ece0f..2c5a18973afa8 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -15,6 +15,7 @@ _np_version_under1p11 = _nlv < '1.11' _np_version_under1p12 = _nlv < '1.12' _np_version_under1p13 = _nlv < '1.13' +_np_version_under1p14 = _nlv < '1.14' if _nlv < '1.7.0': raise ImportError('this version of pandas is incompatible with ' @@ -74,4 +75,6 @@ def np_array_datetime64_compat(arr, *args, **kwargs): '_np_version_under1p10', '_np_version_under1p11', '_np_version_under1p12', + '_np_version_under1p13', + '_np_version_under1p14' ] diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py index b875bbb0d63c0..f6223c48994ae 100644 --- a/pandas/compat/pickle_compat.py +++ b/pandas/compat/pickle_compat.py @@ -15,7 +15,7 @@ def load_reduce(self): args = stack.pop() func = stack[-1] - if type(args[0]) is type: + if len(args) and type(args[0]) is type: n = args[0].__name__ # noqa try: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 77d79c9585e57..d74c5e66ea1a9 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -163,7 +163,7 @@ def _ensure_arraylike(values): ABCIndexClass, ABCSeries)): inferred = lib.infer_dtype(values) if inferred in ['mixed', 'string', 'unicode']: - values = np.asarray(values, dtype=object) + values = lib.list_to_object_array(values) else: values = np.asarray(values) return values @@ -328,6 +328,11 @@ def unique(values): [b, a, c] Categories (3, object): [a < b < c] + An array of tuples + + >>> pd.unique([('a', 'b'), ('b', 'a'), ('a', 'c'), ('b', 'a')]) + array([('a', 'b'), ('b', 'a'), ('a', 'c')], dtype=object) + See Also -------- pandas.Index.unique diff --git a/pandas/core/base.py b/pandas/core/base.py index a3ef24c80f883..97c4c8626dcbb 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -378,7 +378,7 @@ def aggregate(self, func, *args, **kwargs): def _try_aggregate_string_function(self, arg, *args, **kwargs): """ if arg is a string, then try to operate on it: - - try to find a function on ourselves + - try to find a function (or attribute) on ourselves - try to find a numpy function - raise @@ -387,7 +387,15 @@ def _try_aggregate_string_function(self, arg, *args, **kwargs): f = getattr(self, arg, None) if f is not None: - return f(*args, **kwargs) + if callable(f): + return f(*args, **kwargs) + + 
# people may try to aggregate on a non-callable attribute + # but don't let them think they can pass args to it + assert len(args) == 0 + assert len([kwarg for kwarg in kwargs + if kwarg not in ['axis', '_level']]) == 0 + return f f = getattr(np, arg, None) if f is not None: diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index fd61813a57c98..16b0a5c8a74ca 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -668,7 +668,7 @@ def maybe_convert_objects(values, convert_dates=True, convert_numeric=True, if convert_timedeltas == 'coerce': from pandas.core.tools.timedeltas import to_timedelta - new_values = to_timedelta(values, coerce=True) + new_values = to_timedelta(values, errors='coerce') # if we are all nans then leave me alone if not isnull(new_values).all(): diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py index a5316a83612cb..ff7e215951a1f 100644 --- a/pandas/core/dtypes/inference.py +++ b/pandas/core/dtypes/inference.py @@ -171,7 +171,7 @@ def is_file_like(obj): if not (hasattr(obj, 'read') or hasattr(obj, 'write')): return False - if not is_iterator(obj): + if not hasattr(obj, "__iter__"): return False return True diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 743d623ee5e44..2b2e7be62427b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -184,7 +184,7 @@ dataset. * "many_to_one" or "m:1": check if merge keys are unique in right dataset. - * "many_to_may" or "m:m": allowed, but does not result in checks. + * "many_to_many" or "m:m": allowed, but does not result in checks. .. versionadded:: 0.21.0 @@ -241,17 +241,47 @@ class DataFrame(NDFrame): Column labels to use for resulting frame. Will default to np.arange(n) if no column labels are provided dtype : dtype, default None - Data type to force, otherwise infer + Data type to force. Only a single dtype is allowed. If None, infer copy : boolean, default False Copy data from inputs. Only affects DataFrame / 2d ndarray input Examples -------- - >>> d = {'col1': ts1, 'col2': ts2} - >>> df = DataFrame(data=d, index=index) - >>> df2 = DataFrame(np.random.randn(10, 5)) - >>> df3 = DataFrame(np.random.randn(10, 5), - ... columns=['a', 'b', 'c', 'd', 'e']) + Constructing DataFrame from a dictionary. + + >>> d = {'col1': [1, 2], 'col2': [3, 4]} + >>> df = pd.DataFrame(data=d) + >>> df + col1 col2 + 0 1 3 + 1 2 4 + + Notice that the inferred dtype is int64. + + >>> df.dtypes + col1 int64 + col2 int64 + dtype: object + + To enforce a single dtype: + + >>> df = pd.DataFrame(data=d, dtype=np.int8) + >>> df.dtypes + col1 int8 + col2 int8 + dtype: object + + Constructing DataFrame from numpy ndarray: + + >>> df2 = pd.DataFrame(np.random.randint(low=0, high=10, size=(5, 5)), + ... 
columns=['a', 'b', 'c', 'd', 'e']) + >>> df2 + a b c d e + 0 2 8 8 3 4 + 1 4 2 9 0 9 + 2 1 0 7 8 0 + 3 5 1 7 1 3 + 4 6 0 2 4 2 See also -------- @@ -1888,7 +1918,7 @@ def get_value(self, index, col, takeable=False): try: return engine.get_value(series._values, index) - except TypeError: + except (TypeError, ValueError): # we cannot handle direct indexing # use positional diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e541f1532d0a0..accb7d0db1d2c 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2366,9 +2366,14 @@ def add_suffix(self, suffix): 1 A 1 1 """ - def sort_values(self, by, axis=0, ascending=True, inplace=False, + def sort_values(self, by=None, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last'): - raise AbstractMethodError(self) + """ + NOT IMPLEMENTED: do not call this method, as sorting values is not + supported for Panel objects and will raise an error. + """ + raise NotImplementedError("sort_values has not been implemented " + "on Panel or Panel4D objects.") _shared_docs['sort_index'] = """ Sort object by labels (along an axis) @@ -4280,7 +4285,7 @@ def asof(self, where, subset=None): raise ValueError("subset is not valid for Series") elif self.ndim > 2: raise NotImplementedError("asof is not implemented " - "for {type}".format(type(self))) + "for {type}".format(type=type(self))) else: if subset is None: subset = self.columns @@ -4975,7 +4980,7 @@ def last(self, offset): offset = to_offset(offset) - start_date = start = self.index[-1] - offset + start_date = self.index[-1] - offset start = self.index.searchsorted(start_date, side='right') return self.iloc[start:] @@ -5298,8 +5303,8 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None, # slice me out of the other else: - raise NotImplemented("cannot align with a higher dimensional " - "NDFrame") + raise NotImplementedError("cannot align with a higher " + "dimensional NDFrame") elif is_list_like(other): diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 91b55c414b507..9d6d2297f6ea0 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -150,7 +150,7 @@ 'last', 'first', 'head', 'tail', 'median', 'mean', 'sum', 'min', 'max', - 'cumcount', + 'cumcount', 'ngroup', 'resample', 'rank', 'quantile', 'fillna', @@ -1437,6 +1437,75 @@ def nth(self, n, dropna=None): return result + @Substitution(name='groupby') + @Appender(_doc_template) + def ngroup(self, ascending=True): + """ + Number each group from 0 to the number of groups - 1. + + This is the enumerative complement of cumcount. Note that the + numbers given to the groups match the order in which the groups + would be seen when iterating over the groupby object, not the + order they are first observed. + + .. versionadded:: 0.20.2 + + Parameters + ---------- + ascending : bool, default True + If False, number in reverse, from number of group - 1 to 0. + + Examples + -------- + + >>> df = pd.DataFrame({"A": list("aaabba")}) + >>> df + A + 0 a + 1 a + 2 a + 3 b + 4 b + 5 a + >>> df.groupby('A').ngroup() + 0 0 + 1 0 + 2 0 + 3 1 + 4 1 + 5 0 + dtype: int64 + >>> df.groupby('A').ngroup(ascending=False) + 0 1 + 1 1 + 2 1 + 3 0 + 4 0 + 5 1 + dtype: int64 + >>> df.groupby(["A", [1,1,2,3,2,1]]).ngroup() + 0 0 + 1 0 + 2 1 + 3 3 + 4 2 + 5 0 + dtype: int64 + + See also + -------- + .cumcount : Number the rows in each group. 
+ + """ + + self._set_group_selection() + + index = self._selected_obj.index + result = Series(self.grouper.group_info[0], index) + if not ascending: + result = self.ngroups - 1 - result + return result + @Substitution(name='groupby') @Appender(_doc_template) def cumcount(self, ascending=True): @@ -1481,6 +1550,10 @@ def cumcount(self, ascending=True): 4 0 5 0 dtype: int64 + + See also + -------- + .ngroup : Number the groups themselves. """ self._set_group_selection() @@ -3337,13 +3410,15 @@ def _cython_agg_blocks(self, how, alt=None, numeric_only=True): obj = self.obj[data.items[locs]] s = groupby(obj, self.grouper) result = s.aggregate(lambda x: alt(x, axis=self.axis)) - result = result._data.blocks[0] + newb = result._data.blocks[0] + + finally: - # see if we can cast the block back to the original dtype - result = block._try_coerce_and_cast_result(result) + # see if we can cast the block back to the original dtype + result = block._try_coerce_and_cast_result(result) + newb = block.make_block(result) new_items.append(locs) - newb = block.make_block_same_class(result) new_blocks.append(newb) if len(new_blocks) == 0: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 2af4f112ca941..028464ad5cd89 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1191,6 +1191,15 @@ def is_monotonic_increasing(self): """ return if the index is monotonic increasing (only equal or increasing) values. + + Examples + -------- + >>> Index([1, 2, 3]).is_monotonic_increasing + True + >>> Index([1, 2, 2]).is_monotonic_increasing + True + >>> Index([1, 3, 2]).is_monotonic_increasing + False """ return self._engine.is_monotonic_increasing @@ -1199,9 +1208,50 @@ def is_monotonic_decreasing(self): """ return if the index is monotonic decreasing (only equal or decreasing) values. 
+ + Examples + -------- + >>> Index([3, 2, 1]).is_monotonic_decreasing + True + >>> Index([3, 2, 2]).is_monotonic_decreasing + True + >>> Index([3, 1, 2]).is_monotonic_decreasing + False """ return self._engine.is_monotonic_decreasing + @property + def _is_strictly_monotonic_increasing(self): + """return if the index is strictly monotonic increasing + (only increasing) values + + Examples + -------- + >>> Index([1, 2, 3])._is_strictly_monotonic_increasing + True + >>> Index([1, 2, 2])._is_strictly_monotonic_increasing + False + >>> Index([1, 3, 2])._is_strictly_monotonic_increasing + False + """ + return self.is_unique and self.is_monotonic_increasing + + @property + def _is_strictly_monotonic_decreasing(self): + """return if the index is strictly monotonic decreasing + (only decreasing) values + + Examples + -------- + >>> Index([3, 2, 1])._is_strictly_monotonic_decreasing + True + >>> Index([3, 2, 2])._is_strictly_monotonic_decreasing + False + >>> Index([3, 1, 2])._is_strictly_monotonic_decreasing + False + """ + return self.is_unique and self.is_monotonic_decreasing + def is_lexsorted_for_tuple(self, tup): return True @@ -1590,7 +1640,7 @@ def __contains__(self, key): hash(key) try: return key in self._engine - except TypeError: + except (TypeError, ValueError): return False _index_shared_docs['contains'] = """ @@ -1610,7 +1660,7 @@ def contains(self, key): hash(key) try: return key in self._engine - except TypeError: + except (TypeError, ValueError): return False def __hash__(self): diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index ec678b1577d81..239894cff3874 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1472,7 +1472,7 @@ def _maybe_cast_slice_bound(self, label, side, kind): # the bounds need swapped if index is reverse sorted and has a # length > 1 (is_monotonic_decreasing gives True for empty # and length 1 index) - if self.is_monotonic_decreasing and len(self) > 1: + if self._is_strictly_monotonic_decreasing and len(self) > 1: return upper if side == 'left' else lower return lower if side == 'left' else upper else: diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index b1523cd6c0d0c..e6b2bc0953680 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1053,11 +1053,11 @@ def interval_range(start=None, end=None, freq=None, periods=None, if periods is None or end is None: raise ValueError("must specify 2 of start, end, periods") start = end - periods * freq - elif end is None: + if end is None: if periods is None or start is None: raise ValueError("must specify 2 of start, end, periods") end = start + periods * freq - elif periods is None: + if periods is None: if start is None or end is None: raise ValueError("must specify 2 of start, end, periods") pass diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 569e16f2141ae..f30da5b05f8ae 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -414,6 +414,12 @@ def view(self, cls=None): return result def _shallow_copy_with_infer(self, values=None, **kwargs): + # On equal MultiIndexes the difference is empty. 
+ # Therefore, an empty MultiIndex is returned GH13490 + if len(values) == 0: + return MultiIndex(levels=[[] for _ in range(self.nlevels)], + labels=[[] for _ in range(self.nlevels)], + **kwargs) return self._shallow_copy(values, **kwargs) @Appender(_index_shared_docs['_shallow_copy']) @@ -1284,8 +1290,8 @@ def remove_unused_levels(self): new_levels = [] new_labels = [] - changed = np.ones(self.nlevels, dtype=bool) - for i, (lev, lab) in enumerate(zip(self.levels, self.labels)): + changed = False + for lev, lab in zip(self.levels, self.labels): uniques = algos.unique(lab) @@ -1293,33 +1299,29 @@ def remove_unused_levels(self): if len(uniques) == len(lev): new_levels.append(lev) new_labels.append(lab) - changed[i] = False continue - # set difference, then reverse sort - diff = Index(np.arange(len(lev))).difference(uniques) - unused = diff.sort_values(ascending=False) + changed = True + + # labels get mapped from uniques to 0:len(uniques) + label_mapping = np.zeros(len(lev)) + label_mapping[uniques] = np.arange(len(uniques)) + lab = label_mapping[lab] # new levels are simple lev = lev.take(uniques) - # new labels, we remove the unsued - # by decrementing the labels for that value - # prob a better way - for u in unused: - - lab = np.where(lab > u, lab - 1, lab) - new_levels.append(lev) new_labels.append(lab) - # nothing changed - if not changed.any(): - return self + result = self._shallow_copy() - return MultiIndex(new_levels, new_labels, - names=self.names, sortorder=self.sortorder, - verify_integrity=False) + if changed: + result._reset_identity() + result._set_levels(new_levels, validate=False) + result._set_labels(new_labels, validate=False) + + return result @property def nlevels(self): diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index bdae0ac7ac5e9..72d521cbe2d60 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -369,6 +369,8 @@ def get_loc(self, key, method=None, tolerance=None): except (ValueError, IndexError): # should only need to catch ValueError here but on numpy # 1.7 .item() can raise IndexError when NaNs are present + if not len(nan_idxs): + raise KeyError(key) return nan_idxs except (TypeError, NotImplementedError): pass diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 15fd9b7dc2b6a..f8af6c8303d99 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -125,15 +125,7 @@ def _new_PeriodIndex(cls, **d): class PeriodIndex(DatelikeOps, DatetimeIndexOpsMixin, Int64Index): """ Immutable ndarray holding ordinal values indicating regular periods in - time such as particular years, quarters, months, etc. A value of 1 is the - period containing the Gregorian proleptic datetime Jan 1, 0001 00:00:00. - This ordinal representation is from the scikits.timeseries project. - - For instance, - # construct period for day 1/1/1 and get the first second - i = Period(year=1,month=1,day=1,freq='D').asfreq('S', 'S') - i.ordinal - ===> 1 + time such as particular years, quarters, months, etc. Index keys are boxed to Period objects which carries the metadata (eg, frequency information). 
diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 15851a17274ca..f2a7ac76481d4 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -471,7 +471,7 @@ def astype(self, dtype, copy=False, errors='raise', values=None, **kwargs): **kwargs) def _astype(self, dtype, copy=False, errors='raise', values=None, - klass=None, mgr=None, **kwargs): + klass=None, mgr=None, raise_on_error=False, **kwargs): """ Coerce to the new type (if copy=True, return a new copy) raise on an except if raise == True @@ -4645,7 +4645,6 @@ def _block2d_to_blocknd(values, placement, shape, labels, ref_items): pvalues = np.empty(panel_shape, dtype=dtype) pvalues.fill(fill_value) - values = values for i in range(len(placement)): pvalues[i].flat[mask] = values[:, i] @@ -5154,8 +5153,6 @@ def dtype(self): return _get_dtype(maybe_promote(self.block.dtype, self.block.fill_value)[0]) - return self._dtype - @cache_readonly def is_null(self): if self.block is None: diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 51778684d68f5..5aabc9d8730dd 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -143,12 +143,6 @@ def interpolate_1d(xvalues, yvalues, method='linear', limit=None, 'DatetimeIndex') method = 'values' - def _interp_limit(invalid, fw_limit, bw_limit): - "Get idx of values that won't be filled b/c they exceed the limits." - for x in np.where(invalid)[0]: - if invalid[max(0, x - fw_limit):x + bw_limit + 1].all(): - yield x - valid_limit_directions = ['forward', 'backward', 'both'] limit_direction = limit_direction.lower() if limit_direction not in valid_limit_directions: @@ -180,21 +174,29 @@ def _interp_limit(invalid, fw_limit, bw_limit): # default limit is unlimited GH #16282 if limit is None: - limit = len(xvalues) + # limit = len(xvalues) + pass elif not is_integer(limit): raise ValueError('Limit must be an integer') elif limit < 1: raise ValueError('Limit must be greater than 0') # each possible limit_direction - if limit_direction == 'forward': + # TODO: do we need sorted? + if limit_direction == 'forward' and limit is not None: violate_limit = sorted(start_nans | set(_interp_limit(invalid, limit, 0))) - elif limit_direction == 'backward': + elif limit_direction == 'forward': + violate_limit = sorted(start_nans) + elif limit_direction == 'backward' and limit is not None: violate_limit = sorted(end_nans | set(_interp_limit(invalid, 0, limit))) - elif limit_direction == 'both': + elif limit_direction == 'backward': + violate_limit = sorted(end_nans) + elif limit_direction == 'both' and limit is not None: violate_limit = sorted(_interp_limit(invalid, limit, limit)) + else: + violate_limit = [] xvalues = getattr(xvalues, 'values', xvalues) yvalues = getattr(yvalues, 'values', yvalues) @@ -630,3 +632,58 @@ def fill_zeros(result, x, y, name, fill): result = result.reshape(shape) return result + + +def _interp_limit(invalid, fw_limit, bw_limit): + """Get idx of values that won't be filled b/c they exceed the limits. + + This is equivalent to the more readable, but slower + + .. code-block:: python + + for x in np.where(invalid)[0]: + if invalid[max(0, x - fw_limit):x + bw_limit + 1].all(): + yield x + """ + # handle forward first; the backward direction is the same except + # 1. operate on the reversed array + # 2. 
subtract the returned indices from N - 1
+    N = len(invalid)
+
+    def inner(invalid, limit):
+        limit = min(limit, N)
+        windowed = _rolling_window(invalid, limit + 1).all(1)
+        idx = (set(np.where(windowed)[0] + limit) |
+               set(np.where((~invalid[:limit + 1]).cumsum() == 0)[0]))
+        return idx
+
+    if fw_limit == 0:
+        f_idx = set(np.where(invalid)[0])
+    else:
+        f_idx = inner(invalid, fw_limit)
+
+    if bw_limit == 0:
+        # then we don't even need to care about backwards, just use forwards
+        return f_idx
+    else:
+        b_idx = set(N - 1 - np.asarray(list(inner(invalid[::-1], bw_limit))))
+        if fw_limit == 0:
+            return b_idx
+    return f_idx & b_idx
+
+
+def _rolling_window(a, window):
+    """
+    [True, True, False, True, False], 2 ->
+
+    [
+        [True, True],
+        [True, False],
+        [False, True],
+        [True, False],
+    ]
+    """
+    # https://stackoverflow.com/a/6811241
+    shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
+    strides = a.strides + (a.strides[-1],)
+    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py
index af2eb734a02f6..96603b6adc3b0 100644
--- a/pandas/core/reshape/concat.py
+++ b/pandas/core/reshape/concat.py
@@ -197,6 +197,8 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
 0   a    2
 >>> pd.concat([df5, df6], verify_integrity=True)
+    Traceback (most recent call last):
+        ...
 ValueError: Indexes have overlapping values: ['a']
 """
 op = _Concatenator(objs, axis=axis, join_axes=join_axes,
diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py
index 74dbbfc00cb11..0581ec7484c49 100644
--- a/pandas/core/reshape/pivot.py
+++ b/pandas/core/reshape/pivot.py
@@ -50,26 +50,36 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',
 Examples
 --------
+    >>> df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo",
+    ...                          "bar", "bar", "bar", "bar"],
+    ...                    "B": ["one", "one", "one", "two", "two",
+    ...                          "one", "one", "two", "two"],
+    ...                    "C": ["small", "large", "large", "small",
+    ...                          "small", "large", "small", "small",
+    ...                          "large"],
+    ...                    "D": [1, 2, 2, 3, 3, 4, 5, 6, 7]})
 >>> df
-      A   B   C      D
-    0   foo one small  1
-    1   foo one large  2
-    2   foo one large  2
-    3   foo two small  3
-    4   foo two small  3
-    5   bar one large  4
-    6   bar one small  5
-    7   bar two small  6
-    8   bar two large  7
+         A    B      C  D
+    0  foo  one  small  1
+    1  foo  one  large  2
+    2  foo  one  large  2
+    3  foo  two  small  3
+    4  foo  two  small  3
+    5  bar  one  large  4
+    6  bar  one  small  5
+    7  bar  two  small  6
+    8  bar  two  large  7
 >>> table = pivot_table(df, values='D', index=['A', 'B'],
 ...                     columns=['C'], aggfunc=np.sum)
 >>> table
+    ... # doctest: +NORMALIZE_WHITESPACE
-          small  large
-    foo   one    1      4
-          two    6      NaN
-    bar   one    5      4
-          two    6      7
+    C        large  small
+    A   B
+    bar one    4.0    5.0
+        two    7.0    6.0
+    foo one    4.0    1.0
+        two    NaN    6.0
 Returns
 -------
@@ -388,7 +398,8 @@ def _convert_by(by):
 def crosstab(index, columns, values=None, rownames=None, colnames=None,
-             aggfunc=None, margins=False, dropna=True, normalize=False):
+             aggfunc=None, margins=False, margins_name='All', dropna=True,
+             normalize=False):
 """
 Compute a simple cross-tabulation of two (or more) factors. 
By default computes a frequency table of the factors unless an array of values and an @@ -411,6 +422,12 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None, If passed, must match number of column arrays passed margins : boolean, default False Add row/column margins (subtotals) + margins_name : string, default 'All' + Name of the row / column that will contain the totals + when margins is True. + + .. versionadded:: 0.21.0 + dropna : boolean, default True Do not include columns whose entries are all NaN normalize : boolean, {'all', 'index', 'columns'}, or {0,1}, default False @@ -438,27 +455,27 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None, Examples -------- - >>> a - array([foo, foo, foo, foo, bar, bar, - bar, bar, foo, foo, foo], dtype=object) - >>> b - array([one, one, one, two, one, one, - one, two, two, two, one], dtype=object) - >>> c - array([dull, dull, shiny, dull, dull, shiny, - shiny, dull, shiny, shiny, shiny], dtype=object) - - >>> crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c']) - b one two - c dull shiny dull shiny + >>> a = np.array(["foo", "foo", "foo", "foo", "bar", "bar", + ... "bar", "bar", "foo", "foo", "foo"], dtype=object) + >>> b = np.array(["one", "one", "one", "two", "one", "one", + ... "one", "two", "two", "two", "one"], dtype=object) + >>> c = np.array(["dull", "dull", "shiny", "dull", "dull", "shiny", + ... "shiny", "dull", "shiny", "shiny", "shiny"], + ... dtype=object) + + >>> pd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c']) + ... # doctest: +NORMALIZE_WHITESPACE + b one two + c dull shiny dull shiny a - bar 1 2 1 0 - foo 2 2 1 2 + bar 1 2 1 0 + foo 2 2 1 2 >>> foo = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c']) >>> bar = pd.Categorical(['d', 'e'], categories=['d', 'e', 'f']) >>> crosstab(foo, bar) # 'c' and 'f' are not represented in the data, - # but they still will be counted in the output + ... # but they still will be counted in the output + ... 
# doctest: +SKIP col_0 d e f row_0 a 1 0 0 @@ -490,23 +507,26 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None, df = DataFrame(data) df['__dummy__'] = 0 table = df.pivot_table('__dummy__', index=rownames, columns=colnames, - aggfunc=len, margins=margins, dropna=dropna) + aggfunc=len, margins=margins, + margins_name=margins_name, dropna=dropna) table = table.fillna(0).astype(np.int64) else: data['__dummy__'] = values df = DataFrame(data) table = df.pivot_table('__dummy__', index=rownames, columns=colnames, - aggfunc=aggfunc, margins=margins, dropna=dropna) + aggfunc=aggfunc, margins=margins, + margins_name=margins_name, dropna=dropna) # Post-process if normalize is not False: - table = _normalize(table, normalize=normalize, margins=margins) + table = _normalize(table, normalize=normalize, margins=margins, + margins_name=margins_name) return table -def _normalize(table, normalize, margins): +def _normalize(table, normalize, margins, margins_name='All'): if not isinstance(normalize, bool) and not isinstance(normalize, compat.string_types): @@ -537,9 +557,9 @@ def _normalize(table, normalize, margins): elif margins is True: - column_margin = table.loc[:, 'All'].drop('All') - index_margin = table.loc['All', :].drop('All') - table = table.drop('All', axis=1).drop('All') + column_margin = table.loc[:, margins_name].drop(margins_name) + index_margin = table.loc[margins_name, :].drop(margins_name) + table = table.drop(margins_name, axis=1).drop(margins_name) # to keep index and columns names table_index_names = table.index.names table_columns_names = table.columns.names @@ -561,7 +581,7 @@ def _normalize(table, normalize, margins): elif normalize == "all" or normalize is True: column_margin = column_margin / column_margin.sum() index_margin = index_margin / index_margin.sum() - index_margin.loc['All'] = 1 + index_margin.loc[margins_name] = 1 table = concat([table, column_margin], axis=1) table = table.append(index_margin) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index f944dfe22361a..dcb83d225699d 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -48,23 +48,23 @@ class _Unstacker(object): >>> import pandas as pd >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'), ... ('two', 'a'), ('two', 'b')]) - >>> s = pd.Series(np.arange(1.0, 5.0), index=index) + >>> s = pd.Series(np.arange(1, 5, dtype=np.int64), index=index) >>> s - one a 1 - b 2 - two a 3 - b 4 - dtype: float64 + one a 1 + b 2 + two a 3 + b 4 + dtype: int64 >>> s.unstack(level=-1) - a b + a b one 1 2 two 3 4 >>> s.unstack(level=0) one two - a 1 2 - b 3 4 + a 1 3 + b 2 4 Returns ------- @@ -789,18 +789,18 @@ def lreshape(data, groups, dropna=True, label=None): >>> import pandas as pd >>> data = pd.DataFrame({'hr1': [514, 573], 'hr2': [545, 526], ... 'team': ['Red Sox', 'Yankees'], - ... 'year1': [2007, 2008], 'year2': [2008, 2008]}) + ... 'year1': [2007, 2007], 'year2': [2008, 2008]}) >>> data hr1 hr2 team year1 year2 0 514 545 Red Sox 2007 2008 1 573 526 Yankees 2007 2008 >>> pd.lreshape(data, {'year': ['year1', 'year2'], 'hr': ['hr1', 'hr2']}) - team hr year - 0 Red Sox 514 2007 - 1 Yankees 573 2007 - 2 Red Sox 545 2008 - 3 Yankees 526 2008 + team year hr + 0 Red Sox 2007 514 + 1 Yankees 2007 573 + 2 Red Sox 2008 545 + 3 Yankees 2008 526 Returns ------- @@ -905,11 +905,12 @@ def wide_to_long(df, stubnames, i, j, sep="", suffix='\d+'): ... 
}) >>> df["id"] = df.index >>> df - A1970 A1980 B1970 B1980 X id + A1970 A1980 B1970 B1980 X id 0 a d 2.5 3.2 -1.085631 0 1 b e 1.2 1.3 0.997345 1 2 c f 0.7 0.1 0.282978 2 >>> pd.wide_to_long(df, ["A", "B"], i="id", j="year") + ... # doctest: +NORMALIZE_WHITESPACE X A B id year 0 1970 -1.085631 a 2.5 @@ -940,6 +941,7 @@ def wide_to_long(df, stubnames, i, j, sep="", suffix='\d+'): 8 3 3 2.1 2.9 >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age') >>> l + ... # doctest: +NORMALIZE_WHITESPACE ht famid birth age 1 1 1 2.8 @@ -979,41 +981,44 @@ def wide_to_long(df, stubnames, i, j, sep="", suffix='\d+'): Less wieldy column names are also handled + >>> np.random.seed(0) >>> df = pd.DataFrame({'A(quarterly)-2010': np.random.rand(3), ... 'A(quarterly)-2011': np.random.rand(3), ... 'B(quarterly)-2010': np.random.rand(3), ... 'B(quarterly)-2011': np.random.rand(3), ... 'X' : np.random.randint(3, size=3)}) >>> df['id'] = df.index - >>> df - A(quarterly)-2010 A(quarterly)-2011 B(quarterly)-2010 B(quarterly)-2011 - 0 0.531828 0.724455 0.322959 0.293714 - 1 0.634401 0.611024 0.361789 0.630976 - 2 0.849432 0.722443 0.228263 0.092105 - \ + >>> df # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS + A(quarterly)-2010 A(quarterly)-2011 B(quarterly)-2010 ... + 0 0.548814 0.544883 0.437587 ... + 1 0.715189 0.423655 0.891773 ... + 2 0.602763 0.645894 0.963663 ... X id 0 0 0 1 1 1 - 2 2 2 - >>> pd.wide_to_long(df, ['A(quarterly)', 'B(quarterly)'], - i='id', j='year', sep='-') - X A(quarterly) B(quarterly) + 2 1 2 + + >>> pd.wide_to_long(df, ['A(quarterly)', 'B(quarterly)'], i='id', + ... j='year', sep='-') + ... # doctest: +NORMALIZE_WHITESPACE + X A(quarterly) B(quarterly) id year - 0 2010 0 0.531828 0.322959 - 1 2010 2 0.634401 0.361789 - 2 2010 2 0.849432 0.228263 - 0 2011 0 0.724455 0.293714 - 1 2011 2 0.611024 0.630976 - 2 2011 2 0.722443 0.092105 + 0 2010 0 0.548814 0.437587 + 1 2010 1 0.715189 0.891773 + 2 2010 1 0.602763 0.963663 + 0 2011 0 0.544883 0.383442 + 1 2011 1 0.423655 0.791725 + 2 2011 1 0.645894 0.528895 If we have many columns, we could also use a regex to find our stubnames and pass that list on to wide_to_long - >>> stubnames = set([match[0] for match in - df.columns.str.findall('[A-B]\(.*\)').values - if match != [] ]) + >>> stubnames = sorted( + ... set([match[0] for match in df.columns.str.findall( + ... r'[A-B]\(.*\)').values if match != [] ]) + ... ) >>> list(stubnames) - ['B(quarterly)', 'A(quarterly)'] + ['A(quarterly)', 'B(quarterly)'] Notes ----- @@ -1133,7 +1138,7 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, 2 0 0 1 >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'], - 'C': [1, 2, 3]}) + ... 
'C': [1, 2, 3]}) >>> pd.get_dummies(df, prefix=['col1', 'col2']) C col1_a col1_b col2_a col2_b col2_c @@ -1149,7 +1154,7 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, 3 1 0 0 4 1 0 0 - >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)) + >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True) b c 0 0 0 1 1 0 diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 746742f47f2aa..d8398023a5083 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -75,18 +75,18 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, Examples -------- >>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), 3, retbins=True) - ([(0.191, 3.367], (0.191, 3.367], (0.191, 3.367], (3.367, 6.533], - (6.533, 9.7], (0.191, 3.367]] - Categories (3, object): [(0.191, 3.367] < (3.367, 6.533] < (6.533, 9.7]], - array([ 0.1905 , 3.36666667, 6.53333333, 9.7 ])) + ... # doctest: +ELLIPSIS + ([(0.19, 3.367], (0.19, 3.367], (0.19, 3.367], (3.367, 6.533], ... + Categories (3, interval[float64]): [(0.19, 3.367] < (3.367, 6.533] ... - >>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), 3, - labels=["good","medium","bad"]) + >>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), + ... 3, labels=["good", "medium", "bad"]) + ... # doctest: +SKIP [good, good, good, medium, bad, good] Categories (3, object): [good < medium < bad] >>> pd.cut(np.ones(5), 4, labels=False) - array([1, 1, 1, 1, 1], dtype=int64) + array([1, 1, 1, 1, 1]) """ # NOTE: this binning code is changed a bit from histogram for var(x) == 0 @@ -182,15 +182,17 @@ def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'): Examples -------- >>> pd.qcut(range(5), 4) - [[0, 1], [0, 1], (1, 2], (2, 3], (3, 4]] - Categories (4, object): [[0, 1] < (1, 2] < (2, 3] < (3, 4]] + ... # doctest: +ELLIPSIS + [(-0.001, 1.0], (-0.001, 1.0], (1.0, 2.0], (2.0, 3.0], (3.0, 4.0]] + Categories (4, interval[float64]): [(-0.001, 1.0] < (1.0, 2.0] ... - >>> pd.qcut(range(5), 3, labels=["good","medium","bad"]) + >>> pd.qcut(range(5), 3, labels=["good", "medium", "bad"]) + ... 
# doctest: +SKIP [good, good, medium, bad, bad] Categories (3, object): [good < medium < bad] >>> pd.qcut(range(5), 4, labels=False) - array([0, 0, 1, 2, 3], dtype=int64) + array([0, 0, 1, 2, 3]) """ x_is_series, series_index, name, x = _preprocess_for_cut(x) @@ -252,7 +254,7 @@ def _bins_to_cuts(x, bins, right=True, labels=None, raise ValueError('Bin labels must be one fewer than ' 'the number of bin edges') if not is_categorical_dtype(labels): - labels = Categorical(labels, ordered=True) + labels = Categorical(labels, categories=labels, ordered=True) np.putmask(ids, na_mask, 0) result = algos.take_nd(labels, ids - 1) diff --git a/pandas/core/sparse/array.py b/pandas/core/sparse/array.py index 8ac9d3916573e..c75de01b98e4e 100644 --- a/pandas/core/sparse/array.py +++ b/pandas/core/sparse/array.py @@ -125,7 +125,7 @@ def _sparse_array_op(left, right, op, name, series=False): name = name[1:] if name in ('and', 'or') and dtype == 'bool': - opname = 'sparse_{name}_uint8'.format(name=name, dtype=dtype) + opname = 'sparse_{name}_uint8'.format(name=name) # to make template simple, cast here left_sp_values = left.sp_values.view(np.uint8) right_sp_values = right.sp_values.view(np.uint8) diff --git a/pandas/core/window.py b/pandas/core/window.py index cf1bad706ae1d..ba7e79944ab0e 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -81,6 +81,7 @@ def __init__(self, obj, window=None, min_periods=None, freq=None, self.freq = freq self.center = center self.win_type = win_type + self.win_freq = None self.axis = obj._get_axis_number(axis) if axis is not None else None self.validate() @@ -996,7 +997,12 @@ def cov(self, other=None, pairwise=None, ddof=1, **kwargs): # only default unset pairwise = True if pairwise is None else pairwise other = self._shallow_copy(other) - window = self._get_window(other) + + # GH 16058: offset window + if self.is_freq_type: + window = self.win_freq + else: + window = self._get_window(other) def _get_cov(X, Y): # GH #12373 : rolling functions error on float32 data @@ -1088,6 +1094,7 @@ def validate(self): "based windows") # this will raise ValueError on non-fixed freqs + self.win_freq = self.window self.window = freq.nanos self.win_type = 'freq' diff --git a/pandas/io/common.py b/pandas/io/common.py index f4e12ea3fb173..cbfc33dbebb81 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -31,7 +31,7 @@ # '1.#INF','-1.#INF', '1.#INF000000', _NA_VALUES = set([ '-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', - 'N/A', 'NA', '#NA', 'NULL', 'NaN', '-NaN', 'nan', '-nan', '' + 'N/A', 'n/a', 'NA', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', '' ]) try: diff --git a/pandas/io/excel.py b/pandas/io/excel.py index aa08e5fd378f0..a4d2fabf76a41 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -141,10 +141,6 @@ convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric data will be read in as floats: Excel stores all numbers as floats internally -has_index_names : boolean, default None - DEPRECATED: for version 0.17+ index names will be automatically - inferred based on index_col. To read Excel output from 0.16.2 and - prior that had saved index names, use True. 
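The deprecated keyword is dropped entirely rather than kept as a no-op: later in this hunk, `_parse_excel` derives the flag directly from the header argument via `has_index_names = is_list_like(header) and len(header) > 1`. A minimal sketch of that predicate (the helper name `infers_index_names` is illustrative, not part of the patch):

```python
from pandas.api.types import is_list_like

def infers_index_names(header):
    # Mirrors the replacement logic in _parse_excel: a list-like header
    # of length > 1 means MultiIndex columns, which in turn implies a
    # dedicated row holding the index names.
    return is_list_like(header) and len(header) > 1

assert infers_index_names([0, 1])    # two header rows -> index names inferred
assert not infers_index_names(0)     # single header row -> no index-name row
assert not infers_index_names([0])   # a one-element list does not qualify
```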
Returns ------- @@ -198,8 +194,8 @@ def get_writer(engine_name): def read_excel(io, sheet_name=0, header=0, skiprows=None, skip_footer=0, index_col=None, names=None, parse_cols=None, parse_dates=False, date_parser=None, na_values=None, thousands=None, - convert_float=True, has_index_names=None, converters=None, - dtype=None, true_values=None, false_values=None, engine=None, + convert_float=True, converters=None, dtype=None, + true_values=None, false_values=None, engine=None, squeeze=False, **kwds): # Can't use _deprecate_kwarg since sheetname=None has a special meaning @@ -218,10 +214,9 @@ def read_excel(io, sheet_name=0, header=0, skiprows=None, skip_footer=0, sheetname=sheet_name, header=header, skiprows=skiprows, names=names, index_col=index_col, parse_cols=parse_cols, parse_dates=parse_dates, date_parser=date_parser, na_values=na_values, thousands=thousands, - convert_float=convert_float, has_index_names=has_index_names, - skip_footer=skip_footer, converters=converters, dtype=dtype, - true_values=true_values, false_values=false_values, squeeze=squeeze, - **kwds) + convert_float=convert_float, skip_footer=skip_footer, + converters=converters, dtype=dtype, true_values=true_values, + false_values=false_values, squeeze=squeeze, **kwds) class ExcelFile(object): @@ -283,9 +278,8 @@ def __fspath__(self): def parse(self, sheet_name=0, header=0, skiprows=None, skip_footer=0, names=None, index_col=None, parse_cols=None, parse_dates=False, date_parser=None, na_values=None, thousands=None, - convert_float=True, has_index_names=None, - converters=None, true_values=None, false_values=None, - squeeze=False, **kwds): + convert_float=True, converters=None, true_values=None, + false_values=None, squeeze=False, **kwds): """ Parse specified sheet(s) into a DataFrame @@ -296,7 +290,6 @@ def parse(self, sheet_name=0, header=0, skiprows=None, skip_footer=0, return self._parse_excel(sheetname=sheet_name, header=header, skiprows=skiprows, names=names, index_col=index_col, - has_index_names=has_index_names, parse_cols=parse_cols, parse_dates=parse_dates, date_parser=date_parser, na_values=na_values, @@ -343,23 +336,17 @@ def _excel2num(x): return i in parse_cols def _parse_excel(self, sheetname=0, header=0, skiprows=None, names=None, - skip_footer=0, index_col=None, has_index_names=None, - parse_cols=None, parse_dates=False, date_parser=None, - na_values=None, thousands=None, convert_float=True, - true_values=None, false_values=None, verbose=False, - dtype=None, squeeze=False, **kwds): + skip_footer=0, index_col=None, parse_cols=None, + parse_dates=False, date_parser=None, na_values=None, + thousands=None, convert_float=True, true_values=None, + false_values=None, verbose=False, dtype=None, + squeeze=False, **kwds): skipfooter = kwds.pop('skipfooter', None) if skipfooter is not None: skip_footer = skipfooter _validate_header_arg(header) - if has_index_names is not None: - warn("\nThe has_index_names argument is deprecated; index names " - "will be automatically inferred based on index_col.\n" - "This argmument is still necessary if reading Excel output " - "from 0.16.2 or prior with index names.", FutureWarning, - stacklevel=3) if 'chunksize' in kwds: raise NotImplementedError("chunksize keyword of read_excel " @@ -511,8 +498,7 @@ def _parse_cell(cell_contents, cell_typ): else: last = data[row][col] - if is_list_like(header) and len(header) > 1: - has_index_names = True + has_index_names = is_list_like(header) and len(header) > 1 # GH 12292 : error when read one empty column from excel file try: diff --git 
a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 054db769c56dd..3deaec2dfbbc5 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1292,7 +1292,9 @@ def _column_header(): self.write_tr(col_row, indent, self.indent_delta, header=True, align=align) - if self.fmt.has_index_names and self.fmt.index: + if all((self.fmt.has_index_names, + self.fmt.index, + self.fmt.show_index_names)): row = ([x if x is not None else '' for x in self.frame.index.names] + [''] * min(len(self.columns), self.max_cols)) diff --git a/pandas/io/formats/terminal.py b/pandas/io/formats/terminal.py index dadd09ae74ea4..30bd1d16b538a 100644 --- a/pandas/io/formats/terminal.py +++ b/pandas/io/formats/terminal.py @@ -14,6 +14,8 @@ from __future__ import print_function import os +import sys +import shutil __all__ = ['get_terminal_size'] @@ -26,6 +28,10 @@ def get_terminal_size(): IPython zmq frontends, or IDLE do not run in a terminal, """ import platform + + if sys.version_info[0] >= 3: + return shutil.get_terminal_size() + current_os = platform.system() tuple_xy = None if current_os == 'Windows': diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index e287d92f67ef6..c2d5a629b03a3 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -13,7 +13,7 @@ import numpy as np from pandas import compat -from pandas.compat import (range, lrange, StringIO, lzip, +from pandas.compat import (range, lrange, PY3, StringIO, lzip, zip, string_types, map, u) from pandas.core.dtypes.common import ( is_integer, _ensure_object, @@ -31,10 +31,10 @@ from pandas.core.common import AbstractMethodError from pandas.io.date_converters import generic_parser from pandas.errors import ParserWarning, ParserError, EmptyDataError -from pandas.io.common import (get_filepath_or_buffer, _validate_header_arg, - _get_handle, UnicodeReader, UTF8Recoder, - BaseIterator, - _NA_VALUES, _infer_compression) +from pandas.io.common import (get_filepath_or_buffer, is_file_like, + _validate_header_arg, _get_handle, + UnicodeReader, UTF8Recoder, _NA_VALUES, + BaseIterator, _infer_compression) from pandas.core.tools import datetimes as tools from pandas.util._decorators import Appender @@ -755,7 +755,9 @@ def __init__(self, f, engine=None, **kwds): self.squeeze = options.pop('squeeze', False) # might mutate self.engine + self.engine = self._check_file_or_buffer(f, engine) self.options, self.engine = self._clean_options(options, engine) + if 'has_index_names' in kwds: self.options['has_index_names'] = kwds['has_index_names'] @@ -801,6 +803,23 @@ def _get_options_with_defaults(self, engine): return options + def _check_file_or_buffer(self, f, engine): + # see gh-16530 + if is_file_like(f): + next_attr = "__next__" if PY3 else "next" + + # The C engine doesn't need the file-like to have the "next" or + # "__next__" attribute. 
However, the Python engine explicitly calls + # "next(...)" when iterating through such an object, meaning it + # needs to have that attribute ("next" for Python 2.x, "__next__" + # for Python 3.x). + if engine != "c" and not hasattr(f, next_attr): + msg = ("The 'python' engine cannot iterate " + "through this file buffer.") + raise ValueError(msg) + + return engine + def _clean_options(self, options, engine): result = options.copy() @@ -969,6 +988,10 @@ def _make_engine(self, engine='c'): klass = PythonParser elif engine == 'python-fwf': klass = FixedWidthFieldParser + else: + raise ValueError('Unknown engine: {engine} (valid options are' + ' "c", "python", or "python-fwf")'.format( + engine=engine)) self._engine = klass(self.f, **self.options) def _failover_to_python(self): @@ -1626,6 +1649,12 @@ def __init__(self, src, **kwds): if self.usecols: usecols = _evaluate_usecols(self.usecols, self.orig_names) + + # GH 14671 + if (self.usecols_dtype == 'string' and + not set(usecols).issubset(self.orig_names)): + raise ValueError("Usecols do not match names.") + if len(self.names) > len(usecols): self.names = [n for i, n in enumerate(self.names) if (i in usecols or n in usecols)] @@ -2182,7 +2211,7 @@ def _exclude_implicit_index(self, alldata): def get_chunk(self, size=None): if size is None: size = self.chunksize return self.read(nrows=size) def _convert_data(self, data): # apply converters diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 6665ccf8ce4c5..ddd25aafa060c 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -73,6 +73,18 @@ def _ensure_encoding(encoding): return encoding +def _ensure_str(name): + """Ensure that an index / column name is a str (python 3) or + unicode (python 2); otherwise they may be np.string dtype. + Non-string dtypes are passed through unchanged. + + https://github.com/pandas-dev/pandas/issues/13492 + """ + if isinstance(name, compat.string_types): + name = compat.text_type(name) + return name + + Term = Expr @@ -494,32 +506,7 @@ def __len__(self): return len(self.groups()) def __unicode__(self): - output = '%s\nFile path: %s\n' % (type(self), pprint_thing(self._path)) - if self.is_open: - lkeys = sorted(list(self.keys())) - if len(lkeys): - keys = [] - values = [] - - for k in lkeys: - try: - s = self.get_storer(k) - if s is not None: - keys.append(pprint_thing(s.pathname or k)) - values.append( - pprint_thing(s or 'invalid_HDFStore node')) - except Exception as detail: - keys.append(k) - values.append("[invalid_HDFStore node: %s]" - % pprint_thing(detail)) - - output += adjoin(12, keys, values) - else: - output += 'Empty' - else: - output += "File is CLOSED" - - return output + return '%s\nFile path: %s\n' % (type(self), pprint_thing(self._path)) def __enter__(self): return self @@ -831,8 +818,8 @@ def func(_start, _stop, _where): # retrieve the objs, _where is always passed as a set of # coordinates here - objs = [t.read(where=_where, columns=columns, **kwargs) - for t in tbls] + objs = [t.read(where=_where, columns=columns, start=_start, + stop=_stop, **kwargs) for t in tbls] # concat and return return concat(objs, axis=axis, @@ -1161,6 +1148,38 @@ def copy(self, file, mode='w', propindexes=True, keys=None, complib=None, return new_store + def info(self): + """return detailed information on the store + + .. 
versionadded:: 0.21.0 + """ + output = '%s\nFile path: %s\n' % (type(self), pprint_thing(self._path)) + if self.is_open: + lkeys = sorted(list(self.keys())) + if len(lkeys): + keys = [] + values = [] + + for k in lkeys: + try: + s = self.get_storer(k) + if s is not None: + keys.append(pprint_thing(s.pathname or k)) + values.append( + pprint_thing(s or 'invalid_HDFStore node')) + except Exception as detail: + keys.append(k) + values.append("[invalid_HDFStore node: %s]" + % pprint_thing(detail)) + + output += adjoin(12, keys, values) + else: + output += 'Empty' + else: + output += "File is CLOSED" + + return output + # private methods ###### def _check_if_open(self): if not self.is_open: @@ -1425,7 +1444,8 @@ def get_result(self, coordinates=False): # if specified read via coordinates (necessary for multiple selections if coordinates: - where = self.s.read_coordinates(where=self.where) + where = self.s.read_coordinates(where=self.where, start=self.start, + stop=self.stop) else: where = self.where @@ -2566,7 +2586,7 @@ def read_index_node(self, node, start=None, stop=None): name = None if 'name' in node._v_attrs: - name = node._v_attrs.name + name = _ensure_str(node._v_attrs.name) index_class = self._alias_to_class(getattr(node._v_attrs, 'index_class', '')) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index b88481abcb2ec..ec5fe45d7f610 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -120,9 +120,9 @@ class MockFile(object): m = MockFile() assert not is_file(m) + # gh-16530: Valid iterator just means we have the + # __iter__ attribute for our purposes. MockFile.__iter__ = lambda self: self - MockFile.__next__ = lambda self: 0 - MockFile.next = MockFile.__next__ # Valid write-only file m = MockFile() diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index aa7c7a7120c1b..a6f39cabb60ed 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -635,3 +635,48 @@ def test_nuiscance_columns(self): expected = DataFrame([[6, 6., 'foobarbaz']], index=['sum'], columns=['A', 'B', 'C']) assert_frame_equal(result, expected) + + def test_non_callable_aggregates(self): + + # GH 16405 + # 'size' is a property of frame/series + # validate that this is working + df = DataFrame({'A': [None, 2, 3], + 'B': [1.0, np.nan, 3.0], + 'C': ['foo', None, 'bar']}) + + # Function aggregate + result = df.agg({'A': 'count'}) + expected = pd.Series({'A': 2}) + + assert_series_equal(result, expected) + + # Non-function aggregate + result = df.agg({'A': 'size'}) + expected = pd.Series({'A': 3}) + + assert_series_equal(result, expected) + + # Mix function and non-function aggs + result1 = df.agg(['count', 'size']) + result2 = df.agg({'A': ['count', 'size'], + 'B': ['count', 'size'], + 'C': ['count', 'size']}) + expected = pd.DataFrame({'A': {'count': 2, 'size': 3}, + 'B': {'count': 2, 'size': 3}, + 'C': {'count': 2, 'size': 3}}) + + assert_frame_equal(result1, result2, check_like=True) + assert_frame_equal(result2, expected, check_like=True) + + # Just functional string arg is same as calling df.arg() + result = df.agg('count') + expected = df.count() + + assert_series_equal(result, expected) + + # Just a string attribute arg same as calling df.arg + result = df.agg('size') + expected = df.size + + assert result == expected diff --git a/pandas/tests/groupby/test_counting.py b/pandas/tests/groupby/test_counting.py new file mode 100644 index 0000000000000..485241d593d4f --- 
/dev/null +++ b/pandas/tests/groupby/test_counting.py @@ -0,0 +1,197 @@ +# -*- coding: utf-8 -*- +from __future__ import print_function + +import numpy as np + +from pandas import (DataFrame, Series, MultiIndex) +from pandas.util.testing import assert_series_equal +from pandas.compat import (range, product as cart_product) + + +class TestCounting(object): + + def test_cumcount(self): + df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A']) + g = df.groupby('A') + sg = g.A + + expected = Series([0, 1, 2, 0, 3]) + + assert_series_equal(expected, g.cumcount()) + assert_series_equal(expected, sg.cumcount()) + + def test_cumcount_empty(self): + ge = DataFrame().groupby(level=0) + se = Series().groupby(level=0) + + # edge case, as this is usually considered float + e = Series(dtype='int64') + + assert_series_equal(e, ge.cumcount()) + assert_series_equal(e, se.cumcount()) + + def test_cumcount_dupe_index(self): + df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'], + index=[0] * 5) + g = df.groupby('A') + sg = g.A + + expected = Series([0, 1, 2, 0, 3], index=[0] * 5) + + assert_series_equal(expected, g.cumcount()) + assert_series_equal(expected, sg.cumcount()) + + def test_cumcount_mi(self): + mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]]) + df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'], + index=mi) + g = df.groupby('A') + sg = g.A + + expected = Series([0, 1, 2, 0, 3], index=mi) + + assert_series_equal(expected, g.cumcount()) + assert_series_equal(expected, sg.cumcount()) + + def test_cumcount_groupby_not_col(self): + df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'], + index=[0] * 5) + g = df.groupby([0, 0, 0, 1, 0]) + sg = g.A + + expected = Series([0, 1, 2, 0, 3], index=[0] * 5) + + assert_series_equal(expected, g.cumcount()) + assert_series_equal(expected, sg.cumcount()) + + def test_ngroup(self): + df = DataFrame({'A': list('aaaba')}) + g = df.groupby('A') + sg = g.A + + expected = Series([0, 0, 0, 1, 0]) + + assert_series_equal(expected, g.ngroup()) + assert_series_equal(expected, sg.ngroup()) + + def test_ngroup_distinct(self): + df = DataFrame({'A': list('abcde')}) + g = df.groupby('A') + sg = g.A + + expected = Series(range(5), dtype='int64') + + assert_series_equal(expected, g.ngroup()) + assert_series_equal(expected, sg.ngroup()) + + def test_ngroup_one_group(self): + df = DataFrame({'A': [0] * 5}) + g = df.groupby('A') + sg = g.A + + expected = Series([0] * 5) + + assert_series_equal(expected, g.ngroup()) + assert_series_equal(expected, sg.ngroup()) + + def test_ngroup_empty(self): + ge = DataFrame().groupby(level=0) + se = Series().groupby(level=0) + + # edge case, as this is usually considered float + e = Series(dtype='int64') + + assert_series_equal(e, ge.ngroup()) + assert_series_equal(e, se.ngroup()) + + def test_ngroup_series_matches_frame(self): + df = DataFrame({'A': list('aaaba')}) + s = Series(list('aaaba')) + + assert_series_equal(df.groupby(s).ngroup(), + s.groupby(s).ngroup()) + + def test_ngroup_dupe_index(self): + df = DataFrame({'A': list('aaaba')}, index=[0] * 5) + g = df.groupby('A') + sg = g.A + + expected = Series([0, 0, 0, 1, 0], index=[0] * 5) + + assert_series_equal(expected, g.ngroup()) + assert_series_equal(expected, sg.ngroup()) + + def test_ngroup_mi(self): + mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]]) + df = DataFrame({'A': list('aaaba')}, index=mi) + g = df.groupby('A') + sg = g.A + expected = Series([0, 0, 0, 1, 0], index=mi) + + 
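+ # ngroup numbers the groups themselves (here 'a' -> 0, 'b' -> 1), while + # cumcount numbers the rows within each group; both return a Series + # aligned on the caller's index, which is why `mi` is preserved.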
assert_series_equal(expected, g.ngroup()) + assert_series_equal(expected, sg.ngroup()) + + def test_ngroup_groupby_not_col(self): + df = DataFrame({'A': list('aaaba')}, index=[0] * 5) + g = df.groupby([0, 0, 0, 1, 0]) + sg = g.A + + expected = Series([0, 0, 0, 1, 0], index=[0] * 5) + + assert_series_equal(expected, g.ngroup()) + assert_series_equal(expected, sg.ngroup()) + + def test_ngroup_descending(self): + df = DataFrame(['a', 'a', 'b', 'a', 'b'], columns=['A']) + g = df.groupby(['A']) + + ascending = Series([0, 0, 1, 0, 1]) + descending = Series([1, 1, 0, 1, 0]) + + assert_series_equal(descending, (g.ngroups - 1) - ascending) + assert_series_equal(ascending, g.ngroup(ascending=True)) + assert_series_equal(descending, g.ngroup(ascending=False)) + + def test_ngroup_matches_cumcount(self): + # verify one manually-worked out case works + df = DataFrame([['a', 'x'], ['a', 'y'], ['b', 'x'], + ['a', 'x'], ['b', 'y']], columns=['A', 'X']) + g = df.groupby(['A', 'X']) + g_ngroup = g.ngroup() + g_cumcount = g.cumcount() + expected_ngroup = Series([0, 1, 2, 0, 3]) + expected_cumcount = Series([0, 0, 0, 1, 0]) + + assert_series_equal(g_ngroup, expected_ngroup) + assert_series_equal(g_cumcount, expected_cumcount) + + def test_ngroup_cumcount_pair(self): + # brute force comparison for all small series + for p in cart_product(range(3), repeat=4): + df = DataFrame({'a': p}) + g = df.groupby(['a']) + + order = sorted(set(p)) + ngroupd = [order.index(val) for val in p] + cumcounted = [p[:i].count(val) for i, val in enumerate(p)] + + assert_series_equal(g.ngroup(), Series(ngroupd)) + assert_series_equal(g.cumcount(), Series(cumcounted)) + + def test_ngroup_respects_groupby_order(self): + np.random.seed(0) + df = DataFrame({'a': np.random.choice(list('abcdef'), 100)}) + for sort_flag in (False, True): + g = df.groupby(['a'], sort=sort_flag) + df['group_id'] = -1 + df['group_index'] = -1 + + for i, (_, group) in enumerate(g): + df.loc[group.index, 'group_id'] = i + for j, ind in enumerate(group.index): + df.loc[ind, 'group_index'] = j + + assert_series_equal(Series(df['group_id'].values), + g.ngroup()) + assert_series_equal(Series(df['group_index'].values), + g.cumcount()) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 88afa51e46b6c..19124a33bdbcb 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -3399,60 +3399,6 @@ def test_groupby_with_small_elem(self): res = grouped.get_group((pd.Timestamp('2014-08-31'), 'start')) tm.assert_frame_equal(res, df.iloc[[2], :]) - def test_cumcount(self): - df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A']) - g = df.groupby('A') - sg = g.A - - expected = Series([0, 1, 2, 0, 3]) - - assert_series_equal(expected, g.cumcount()) - assert_series_equal(expected, sg.cumcount()) - - def test_cumcount_empty(self): - ge = DataFrame().groupby(level=0) - se = Series().groupby(level=0) - - # edge case, as this is usually considered float - e = Series(dtype='int64') - - assert_series_equal(e, ge.cumcount()) - assert_series_equal(e, se.cumcount()) - - def test_cumcount_dupe_index(self): - df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'], - index=[0] * 5) - g = df.groupby('A') - sg = g.A - - expected = Series([0, 1, 2, 0, 3], index=[0] * 5) - - assert_series_equal(expected, g.cumcount()) - assert_series_equal(expected, sg.cumcount()) - - def test_cumcount_mi(self): - mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]]) - df = DataFrame([['a'], 
['a'], ['a'], ['b'], ['a']], columns=['A'], - index=mi) - g = df.groupby('A') - sg = g.A - - expected = Series([0, 1, 2, 0, 3], index=mi) - - assert_series_equal(expected, g.cumcount()) - assert_series_equal(expected, sg.cumcount()) - - def test_cumcount_groupby_not_col(self): - df = DataFrame([['a'], ['a'], ['a'], ['b'], ['a']], columns=['A'], - index=[0] * 5) - g = df.groupby([0, 0, 0, 1, 0]) - sg = g.A - - expected = Series([0, 1, 2, 0, 3], index=[0] * 5) - - assert_series_equal(expected, g.cumcount()) - assert_series_equal(expected, sg.cumcount()) - def test_fill_constistency(self): # GH9221 diff --git a/pandas/tests/groupby/test_whitelist.py b/pandas/tests/groupby/test_whitelist.py index 5d131717f8345..2c8bf57f20fae 100644 --- a/pandas/tests/groupby/test_whitelist.py +++ b/pandas/tests/groupby/test_whitelist.py @@ -24,6 +24,7 @@ 'head', 'tail', 'cumcount', + 'ngroup', 'resample', 'rank', 'quantile', @@ -61,6 +62,7 @@ 'head', 'tail', 'cumcount', + 'ngroup', 'resample', 'rank', 'quantile', @@ -237,7 +239,7 @@ def test_tab_completion(mframe): 'prod', 'size', 'std', 'sum', 'transform', 'var', 'sem', 'count', 'nunique', 'head', 'describe', 'cummax', 'quantile', 'rank', 'cumprod', 'tail', 'resample', 'cummin', 'fillna', - 'cumsum', 'cumcount', 'all', 'shift', 'skew', + 'cumsum', 'cumcount', 'ngroup', 'all', 'shift', 'skew', 'take', 'tshift', 'pct_change', 'any', 'mad', 'corr', 'corrwith', 'cov', 'dtypes', 'ndim', 'diff', 'idxmax', 'idxmin', 'ffill', 'bfill', 'pad', 'backfill', 'rolling', 'expanding']) diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index 6cba7e17abf8e..f99dcee9e5c8a 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -771,3 +771,10 @@ def test_slice_bounds_empty(self): left = empty_idx._maybe_cast_slice_bound('2015-01-02', 'left', 'loc') exp = Timestamp('2015-01-02 00:00:00') assert left == exp + + def test_slice_duplicate_monotonic(self): + # https://github.com/pandas-dev/pandas/issues/16515 + idx = pd.DatetimeIndex(['2017', '2017']) + result = idx._maybe_cast_slice_bound('2017-01-01', 'left', 'loc') + expected = Timestamp('2017-01-01') + assert result == expected diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 6a2087b37631e..18dbe6624008a 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -188,7 +188,6 @@ def test_constructor_ndarray_like(self): # it should be possible to convert any object that satisfies the numpy # ndarray interface directly into an Index class ArrayLike(object): - def __init__(self, array): self.array = array @@ -246,7 +245,6 @@ def test_index_ctor_infer_nan_nat(self): [np.timedelta64('nat'), np.nan], [pd.NaT, np.timedelta64('nat')], [np.timedelta64('nat'), pd.NaT]]: - tm.assert_index_equal(Index(data), exp) tm.assert_index_equal(Index(np.array(data, dtype=object)), exp) @@ -1330,8 +1328,10 @@ def test_tuple_union_bug(self): def test_is_monotonic_incomparable(self): index = Index([5, datetime.now(), 7]) - assert not index.is_monotonic + assert not index.is_monotonic_increasing assert not index.is_monotonic_decreasing + assert not index._is_strictly_monotonic_increasing + assert not index._is_strictly_monotonic_decreasing def test_get_set_value(self): values = np.random.randn(100) @@ -1800,6 +1800,25 @@ def test_string_index_repr(self): assert coerce(idx) == expected + @pytest.mark.parametrize('dtype', [np.int64, np.float64]) + 
@pytest.mark.parametrize('delta', [1, 0, -1]) + def test_addsub_arithmetic(self, dtype, delta): + # GH 8142 + delta = dtype(delta) + idx = pd.Index([10, 11, 12], dtype=dtype) + result = idx + delta + expected = pd.Index(idx.values + delta, dtype=dtype) + tm.assert_index_equal(result, expected) + + # this subtraction used to fail + result = idx - delta + expected = pd.Index(idx.values - delta, dtype=dtype) + tm.assert_index_equal(result, expected) + + tm.assert_index_equal(idx + idx, 2 * idx) + tm.assert_index_equal(idx - idx, 0 * idx) + assert not (idx - idx).empty + class TestMixedIntIndex(Base): # Mostly the tests from common.py for which the results differ @@ -2030,6 +2049,8 @@ def test_is_monotonic_na(self): for index in examples: assert not index.is_monotonic_increasing assert not index.is_monotonic_decreasing + assert not index._is_strictly_monotonic_increasing + assert not index._is_strictly_monotonic_decreasing def test_repr_summary(self): with cf.option_context('display.max_seq_items', 10): diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 1fe4d85815c4b..3f6fd8c8aa827 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -1172,6 +1172,14 @@ def test_get_loc_level(self): assert result == expected assert new_index.equals(index.droplevel(0)) + def test_get_loc_missing_nan(self): + # GH 8569 + idx = MultiIndex.from_arrays([[1.0, 2.0], [3.0, 4.0]]) + assert isinstance(idx.get_loc(1), slice) + pytest.raises(KeyError, idx.get_loc, 3) + pytest.raises(KeyError, idx.get_loc, np.nan) + pytest.raises(KeyError, idx.get_loc, [np.nan]) + def test_slice_locs(self): df = tm.makeTimeDataFrame() stacked = df.stack() @@ -2373,22 +2381,30 @@ def test_is_monotonic(self): i = MultiIndex.from_product([np.arange(10), np.arange(10)], names=['one', 'two']) assert i.is_monotonic + assert i._is_strictly_monotonic_increasing assert Index(i.values).is_monotonic + assert i._is_strictly_monotonic_increasing i = MultiIndex.from_product([np.arange(10, 0, -1), np.arange(10)], names=['one', 'two']) assert not i.is_monotonic + assert not i._is_strictly_monotonic_increasing assert not Index(i.values).is_monotonic + assert not Index(i.values)._is_strictly_monotonic_increasing i = MultiIndex.from_product([np.arange(10), np.arange(10, 0, -1)], names=['one', 'two']) assert not i.is_monotonic + assert not i._is_strictly_monotonic_increasing assert not Index(i.values).is_monotonic + assert not Index(i.values)._is_strictly_monotonic_increasing i = MultiIndex.from_product([[1.0, np.nan, 2.0], ['a', 'b', 'c']]) assert not i.is_monotonic + assert not i._is_strictly_monotonic_increasing assert not Index(i.values).is_monotonic + assert not Index(i.values)._is_strictly_monotonic_increasing # string ordering i = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], @@ -2398,6 +2414,8 @@ def test_is_monotonic(self): names=['first', 'second']) assert not i.is_monotonic assert not Index(i.values).is_monotonic + assert not i._is_strictly_monotonic_increasing + assert not Index(i.values)._is_strictly_monotonic_increasing i = MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['mom', 'next', 'zenith']], @@ -2406,6 +2424,8 @@ def test_is_monotonic(self): names=['first', 'second']) assert i.is_monotonic assert Index(i.values).is_monotonic + assert i._is_strictly_monotonic_increasing + assert Index(i.values)._is_strictly_monotonic_increasing # mixed levels, hits the TypeError i = MultiIndex( @@ -2416,6 +2436,20 @@ def test_is_monotonic(self): names=['household_id', 
'asset_id']) assert not i.is_monotonic + assert not i._is_strictly_monotonic_increasing + + def test_is_strictly_monotonic(self): + idx = pd.MultiIndex(levels=[['bar', 'baz'], ['mom', 'next']], + labels=[[0, 0, 1, 1], [0, 0, 0, 1]]) + assert idx.is_monotonic_increasing + assert not idx._is_strictly_monotonic_increasing + + @pytest.mark.xfail(reason="buggy MultiIndex.is_monotonic_decreasing.") + def test__is_strictly_monotonic_decreasing(self): + idx = pd.MultiIndex(levels=[['baz', 'bar'], ['next', 'mom']], + labels=[[0, 0, 1, 1], [0, 0, 0, 1]]) + assert idx.is_monotonic_decreasing + assert not idx._is_strictly_monotonic_decreasing def test_reconstruct_sort(self): @@ -2489,7 +2523,34 @@ def test_reconstruct_remove_unused(self): # idempotent result2 = result.remove_unused_levels() tm.assert_index_equal(result2, expected) - assert result2 is result + assert result2.is_(result) + + @pytest.mark.parametrize('first_type,second_type', [ + ('int64', 'int64'), + ('datetime64[D]', 'str')]) + def test_remove_unused_levels_large(self, first_type, second_type): + # GH16556 + + # because tests should be deterministic (and this test in particular + # checks that levels are removed, which is not the case for every + # random input): + rng = np.random.RandomState(4) # seed is arbitrary value that works + + size = 1 << 16 + df = DataFrame(dict( + first=rng.randint(0, 1 << 13, size).astype(first_type), + second=rng.randint(0, 1 << 10, size).astype(second_type), + third=rng.rand(size))) + df = df.groupby(['first', 'second']).sum() + df = df[df.third < 0.1] + + result = df.index.remove_unused_levels() + assert len(result.levels[0]) < len(df.index.levels[0]) + assert len(result.levels[1]) < len(df.index.levels[1]) + assert result.equals(df.index) + + expected = df.reset_index().set_index(['first', 'second']).index + tm.assert_index_equal(result, expected) def test_isin(self): values = [('foo', 2), ('bar', 3), ('quux', 4)] @@ -2805,3 +2866,24 @@ def test_tuples_with_name_string(self): pd.Index(li, name='abc') with pytest.raises(ValueError): pd.Index(li, name='a') + + def test_nan_stays_float(self): + + # GH 7031 + idx0 = pd.MultiIndex(levels=[["A", "B"], []], + labels=[[1, 0], [-1, -1]], + names=[0, 1]) + idx1 = pd.MultiIndex(levels=[["C"], ["D"]], + labels=[[0], [0]], + names=[0, 1]) + idxm = idx0.join(idx1, how='outer') + assert pd.isnull(idx0.get_level_values(1)).all() + # the following failed in 0.14.1 + assert pd.isnull(idxm.get_level_values(1)[:-1]).all() + + df0 = pd.DataFrame([[1, 2]], index=idx0) + df1 = pd.DataFrame([[3, 4]], index=idx1) + dfm = df0 - df1 + assert pd.isnull(df0.index.get_level_values(1)).all() + # the following failed in 0.14.1 + assert pd.isnull(dfm.index.get_level_values(1)[:-1]).all() diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index 3d06f1672ae32..62ac337d02727 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -371,6 +371,14 @@ def test_get_loc_na(self): assert idx.get_loc(1) == 1 pytest.raises(KeyError, idx.slice_locs, np.nan) + def test_get_loc_missing_nan(self): + # GH 8569 + idx = Float64Index([1, 2]) + assert idx.get_loc(1) == 0 + pytest.raises(KeyError, idx.get_loc, 3) + pytest.raises(KeyError, idx.get_loc, np.nan) + pytest.raises(KeyError, idx.get_loc, [np.nan]) + def test_contains_nans(self): i = Float64Index([1.0, 2.0, np.nan]) assert np.nan in i @@ -465,16 +473,36 @@ def test_view(self): def test_is_monotonic(self): assert self.index.is_monotonic assert 
self.index.is_monotonic_increasing + assert self.index._is_strictly_monotonic_increasing assert not self.index.is_monotonic_decreasing + assert not self.index._is_strictly_monotonic_decreasing index = self._holder([4, 3, 2, 1]) assert not index.is_monotonic - assert index.is_monotonic_decreasing + assert not index._is_strictly_monotonic_increasing + assert index._is_strictly_monotonic_decreasing index = self._holder([1]) assert index.is_monotonic assert index.is_monotonic_increasing assert index.is_monotonic_decreasing + assert index._is_strictly_monotonic_increasing + assert index._is_strictly_monotonic_decreasing + + def test_is_strictly_monotonic(self): + index = self._holder([1, 1, 2, 3]) + assert index.is_monotonic_increasing + assert not index._is_strictly_monotonic_increasing + + index = self._holder([3, 2, 1, 1]) + assert index.is_monotonic_decreasing + assert not index._is_strictly_monotonic_decreasing + + index = self._holder([1, 1]) + assert index.is_monotonic_increasing + assert index.is_monotonic_decreasing + assert not index._is_strictly_monotonic_increasing + assert not index._is_strictly_monotonic_decreasing def test_logical_compat(self): idx = self.create_index() diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py index c7af0954cf483..0d88e88030604 100644 --- a/pandas/tests/indexes/test_range.py +++ b/pandas/tests/indexes/test_range.py @@ -331,25 +331,35 @@ def test_is_monotonic(self): assert self.index.is_monotonic assert self.index.is_monotonic_increasing assert not self.index.is_monotonic_decreasing + assert self.index._is_strictly_monotonic_increasing + assert not self.index._is_strictly_monotonic_decreasing index = RangeIndex(4, 0, -1) assert not index.is_monotonic + assert not index._is_strictly_monotonic_increasing assert index.is_monotonic_decreasing + assert index._is_strictly_monotonic_decreasing index = RangeIndex(1, 2) assert index.is_monotonic assert index.is_monotonic_increasing assert index.is_monotonic_decreasing + assert index._is_strictly_monotonic_increasing + assert index._is_strictly_monotonic_decreasing index = RangeIndex(2, 1) assert index.is_monotonic assert index.is_monotonic_increasing assert index.is_monotonic_decreasing + assert index._is_strictly_monotonic_increasing + assert index._is_strictly_monotonic_decreasing index = RangeIndex(1, 1) assert index.is_monotonic assert index.is_monotonic_increasing assert index.is_monotonic_decreasing + assert index._is_strictly_monotonic_increasing + assert index._is_strictly_monotonic_decreasing def test_equals_range(self): equiv_pairs = [(RangeIndex(0, 9, 2), RangeIndex(0, 10, 2)), diff --git a/pandas/tests/indexing/test_multiindex.py b/pandas/tests/indexing/test_multiindex.py index 483c39ed8694e..fc6c627075c96 100644 --- a/pandas/tests/indexing/test_multiindex.py +++ b/pandas/tests/indexing/test_multiindex.py @@ -697,6 +697,17 @@ def test_multiindex_slice_first_level(self): index=range(30, 71)) tm.assert_frame_equal(result, expected) + def test_multiindex_symmetric_difference(self): + # GH 13490 + idx = MultiIndex.from_product([['a', 'b'], ['A', 'B']], + names=['a', 'b']) + result = idx ^ idx + assert result.names == idx.names + + idx2 = idx.copy().rename(['A', 'B']) + result = idx ^ idx2 + assert result.names == [None, None] + class TestMultiIndexSlicers(object): diff --git a/pandas/tests/io/data/legacy_pickle/0.19.2/0.19.2_x86_64_darwin_3.6.1.pickle b/pandas/tests/io/data/legacy_pickle/0.19.2/0.19.2_x86_64_darwin_3.6.1.pickle index 6bb02672a4151..75ea95ff402c4 100644 
Binary files a/pandas/tests/io/data/legacy_pickle/0.19.2/0.19.2_x86_64_darwin_3.6.1.pickle and b/pandas/tests/io/data/legacy_pickle/0.19.2/0.19.2_x86_64_darwin_3.6.1.pickle differ diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py index cde920b1511d2..9f4e532ec2287 100644 --- a/pandas/tests/io/formats/test_to_html.py +++ b/pandas/tests/io/formats/test_to_html.py @@ -1869,3 +1869,10 @@ def test_to_html_notebook_has_no_style(self): df = pd.DataFrame({"A": [1, 2, 3]}) result = df.to_html() assert "thead tr:only-child" not in result + + def test_to_html_with_index_names_false(self): + # gh-16493 + df = pd.DataFrame({"A": [1, 2]}, index=pd.Index(['a', 'b'], + name='myindexname')) + result = df.to_html(index_names=False) + assert 'myindexname' not in result diff --git a/pandas/tests/io/generate_legacy_storage_files.py b/pandas/tests/io/generate_legacy_storage_files.py old mode 100644 new mode 100755 index 22c62b738e6a2..996965999724e --- a/pandas/tests/io/generate_legacy_storage_files.py +++ b/pandas/tests/io/generate_legacy_storage_files.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python + """ self-contained to write legacy storage (pickle/msgpack) files """ from __future__ import print_function from warnings import catch_warnings @@ -125,7 +127,11 @@ def create_data(): mixed_dup=mixed_dup_df, dt_mixed_tzs=DataFrame({ u'A': Timestamp('20130102', tz='US/Eastern'), - u'B': Timestamp('20130603', tz='CET')}, index=range(5)) + u'B': Timestamp('20130603', tz='CET')}, index=range(5)), + dt_mixed2_tzs=DataFrame({ + u'A': Timestamp('20130102', tz='US/Eastern'), + u'B': Timestamp('20130603', tz='CET'), + u'C': Timestamp('20130603', tz='UTC')}, index=range(5)) ) with catch_warnings(record=True): diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index 662f06dbb725e..76fb6d442a25a 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -27,7 +27,7 @@ else partial(json.dumps, encoding="utf-8")) -class UltraJSONTests(object): +class TestUltraJSONTests(object): @pytest.mark.skipif(compat.is_platform_32bit(), reason="not compliant on 32-bit, xref #15865") @@ -944,19 +944,19 @@ def my_obj_handler(obj): ujson.decode(ujson.encode(l, default_handler=str))) -class NumpyJSONTests(object): +class TestNumpyJSONTests(object): - def testBool(self): + def test_Bool(self): b = np.bool(True) assert ujson.decode(ujson.encode(b)) == b - def testBoolArray(self): + def test_BoolArray(self): inpt = np.array([True, False, True, True, False, True, False, False], dtype=np.bool) outp = np.array(ujson.decode(ujson.encode(inpt)), dtype=np.bool) tm.assert_numpy_array_equal(inpt, outp) - def testInt(self): + def test_Int(self): num = np.int(2562010) assert np.int(ujson.decode(ujson.encode(num))) == num @@ -984,7 +984,7 @@ def testInt(self): num = np.uint64(2562010) assert np.uint64(ujson.decode(ujson.encode(num))) == num - def testIntArray(self): + def test_IntArray(self): arr = np.arange(100, dtype=np.int) dtypes = (np.int, np.int8, np.int16, np.int32, np.int64, np.uint, np.uint8, np.uint16, np.uint32, np.uint64) @@ -993,7 +993,7 @@ def testIntArray(self): outp = np.array(ujson.decode(ujson.encode(inpt)), dtype=dtype) tm.assert_numpy_array_equal(inpt, outp) - def testIntMax(self): + def test_IntMax(self): num = np.int(np.iinfo(np.int).max) assert np.int(ujson.decode(ujson.encode(num))) == num @@ -1023,7 +1023,7 @@ def testIntMax(self): num = np.uint64(np.iinfo(np.int64).max) assert np.uint64(ujson.decode(ujson.encode(num))) == num 
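The renames above are not cosmetic: pytest's default discovery only collects test classes whose names match `Test*`, so the old `UltraJSONTests`/`NumpyJSONTests`-style classes were skipped and their tests never ran. A minimal, self-contained illustration of the convention (the file name is hypothetical, not part of the patch):

```python
# Save as test_discovery.py and run `pytest test_discovery.py -v`:
# only the second class is collected.

class UltraJSONTests(object):          # skipped: name lacks the "Test" prefix
    def test_never_runs(self):
        assert False                   # would fail, but is never collected


class TestUltraJSONTests(object):      # collected
    def test_runs(self):               # collected: matches the "test*" pattern
        assert True
```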
- def testFloat(self): + def test_Float(self): num = np.float(256.2013) assert np.float(ujson.decode(ujson.encode(num))) == num @@ -1033,7 +1033,7 @@ def testFloat(self): num = np.float64(256.2013) assert np.float64(ujson.decode(ujson.encode(num))) == num - def testFloatArray(self): + def test_FloatArray(self): arr = np.arange(12.5, 185.72, 1.7322, dtype=np.float) dtypes = (np.float, np.float32, np.float64) @@ -1043,7 +1043,7 @@ def testFloatArray(self): inpt, double_precision=15)), dtype=dtype) tm.assert_almost_equal(inpt, outp) - def testFloatMax(self): + def test_FloatMax(self): num = np.float(np.finfo(np.float).max / 10) tm.assert_almost_equal(np.float(ujson.decode( ujson.encode(num, double_precision=15))), num, 15) @@ -1056,7 +1056,7 @@ def testFloatMax(self): tm.assert_almost_equal(np.float64(ujson.decode( ujson.encode(num, double_precision=15))), num, 15) - def testArrays(self): + def test_Arrays(self): arr = np.arange(100) arr = arr.reshape((10, 10)) @@ -1097,13 +1097,13 @@ def testArrays(self): outp = ujson.decode(ujson.encode(arr), numpy=True, dtype=np.float32) tm.assert_almost_equal(arr, outp) - def testOdArray(self): + def test_OdArray(self): def will_raise(): ujson.encode(np.array(1)) pytest.raises(TypeError, will_raise) - def testArrayNumpyExcept(self): + def test_ArrayNumpyExcept(self): input = ujson.dumps([42, {}, 'a']) try: @@ -1186,7 +1186,7 @@ def testArrayNumpyExcept(self): except: assert False, "Wrong exception" - def testArrayNumpyLabelled(self): + def test_ArrayNumpyLabelled(self): input = {'a': []} output = ujson.loads(ujson.dumps(input), numpy=True, labelled=True) assert (np.empty((1, 0)) == output[0]).all() @@ -1220,9 +1220,9 @@ def testArrayNumpyLabelled(self): assert (np.array(['a', 'b']) == output[2]).all() -class PandasJSONTests(object): +class TestPandasJSONTests(object): - def testDataFrame(self): + def test_DataFrame(self): df = DataFrame([[1, 2, 3], [4, 5, 6]], index=[ 'a', 'b'], columns=['x', 'y', 'z']) @@ -1252,7 +1252,7 @@ def testDataFrame(self): tm.assert_index_equal(df.transpose().columns, outp.columns) tm.assert_index_equal(df.transpose().index, outp.index) - def testDataFrameNumpy(self): + def test_DataFrameNumpy(self): df = DataFrame([[1, 2, 3], [4, 5, 6]], index=[ 'a', 'b'], columns=['x', 'y', 'z']) @@ -1275,7 +1275,7 @@ def testDataFrameNumpy(self): tm.assert_index_equal(df.transpose().columns, outp.columns) tm.assert_index_equal(df.transpose().index, outp.index) - def testDataFrameNested(self): + def test_DataFrameNested(self): df = DataFrame([[1, 2, 3], [4, 5, 6]], index=[ 'a', 'b'], columns=['x', 'y', 'z']) @@ -1301,7 +1301,7 @@ def testDataFrameNested(self): 'df2': ujson.decode(ujson.encode(df, orient="split"))} assert ujson.decode(ujson.encode(nested, orient="split")) == exp - def testDataFrameNumpyLabelled(self): + def test_DataFrameNumpyLabelled(self): df = DataFrame([[1, 2, 3], [4, 5, 6]], index=[ 'a', 'b'], columns=['x', 'y', 'z']) @@ -1324,7 +1324,7 @@ def testDataFrameNumpyLabelled(self): tm.assert_index_equal(df.columns, outp.columns) tm.assert_index_equal(df.index, outp.index) - def testSeries(self): + def test_Series(self): s = Series([10, 20, 30, 40, 50, 60], name="series", index=[6, 7, 8, 9, 10, 15]).sort_values() @@ -1372,7 +1372,7 @@ def testSeries(self): s, orient="index"), numpy=True)).sort_values() tm.assert_series_equal(outp, exp) - def testSeriesNested(self): + def test_SeriesNested(self): s = Series([10, 20, 30, 40, 50, 60], name="series", index=[6, 7, 8, 9, 10, 15]).sort_values() @@ -1398,7 +1398,7 @@ def 
testSeriesNested(self): 's2': ujson.decode(ujson.encode(s, orient="index"))} assert ujson.decode(ujson.encode(nested, orient="index")) == exp - def testIndex(self): + def test_Index(self): i = Index([23, 45, 18, 98, 43, 11], name="index") # column indexed diff --git a/pandas/tests/io/parser/c_parser_only.py b/pandas/tests/io/parser/c_parser_only.py index 56ac10404b7b2..48812c04e3b55 100644 --- a/pandas/tests/io/parser/c_parser_only.py +++ b/pandas/tests/io/parser/c_parser_only.py @@ -7,7 +7,9 @@ further arguments when parsing. """ +import os import sys +import tarfile import pytest import numpy as np @@ -446,3 +448,37 @@ def test_comment_whitespace_delimited(self): [7, np.nan], [8, np.nan]]) tm.assert_frame_equal(df, expected) + + def test_file_like_no_next(self): + # gh-16530: the file-like need not have a "next" or "__next__" + # attribute despite having an "__iter__" attribute. + # + # NOTE: This is only true for the C engine, not Python engine. + class NoNextBuffer(StringIO): + def __next__(self): + raise AttributeError("No next method") + + next = __next__ + + data = "a\n1" + + expected = pd.DataFrame({"a": [1]}) + result = self.read_csv(NoNextBuffer(data)) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("tar_suffix", [".tar", ".tar.gz"]) + def test_read_tarfile(self, tar_suffix): + # see gh-16530 + # + # Unfortunately, Python's CSV library can't handle + # tarfile objects (expects string, not bytes when + # iterating through a file-like). + tar_path = os.path.join(self.dirpath, "tar_csv" + tar_suffix) + + tar = tarfile.open(tar_path, "r") + data_file = tar.extractfile("tar_data.csv") + + out = self.read_csv(data_file) + expected = pd.DataFrame({"a": [1]}) + tm.assert_frame_equal(out, expected) diff --git a/pandas/tests/io/parser/data/tar_csv.tar b/pandas/tests/io/parser/data/tar_csv.tar new file mode 100644 index 0000000000000..d1819550e0a00 Binary files /dev/null and b/pandas/tests/io/parser/data/tar_csv.tar differ diff --git a/pandas/tests/io/parser/data/tar_csv.tar.gz b/pandas/tests/io/parser/data/tar_csv.tar.gz new file mode 100644 index 0000000000000..b5a0f3e1b5805 Binary files /dev/null and b/pandas/tests/io/parser/data/tar_csv.tar.gz differ diff --git a/pandas/tests/io/parser/na_values.py b/pandas/tests/io/parser/na_values.py index 362837a46f838..170f9d428c9cc 100644 --- a/pandas/tests/io/parser/na_values.py +++ b/pandas/tests/io/parser/na_values.py @@ -70,8 +70,8 @@ def test_non_string_na_values(self): def test_default_na_values(self): _NA_VALUES = set(['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', - '#N/A', 'N/A', 'NA', '#NA', 'NULL', 'NaN', - 'nan', '-NaN', '-nan', '#N/A N/A', '']) + '#N/A', 'N/A', 'n/a', 'NA', '#NA', 'NULL', 'null', + 'NaN', 'nan', '-NaN', '-nan', '#N/A N/A', '']) assert _NA_VALUES == parsers._NA_VALUES nv = len(_NA_VALUES) diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index 3f62ff44531fb..5d248f2fef59c 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -16,6 +16,13 @@ from pandas.errors import ParserError from pandas.io.parsers import read_csv, read_table +import pytest + + +@pytest.fixture(params=["python", "python-fwf"], ids=lambda val: val) +def python_engine(request): + return request.param + class TestUnsupportedFeatures(object): @@ -82,7 +89,7 @@ def test_c_engine(self): with tm.assert_raises_regex(ValueError, msg): read_csv(StringIO(data), lineterminator='~~') - def test_python_engine(self): + def 
test_python_engine(self, python_engine): from pandas.io.parsers import _python_unsupported as py_unsupported data = """1,2,3,, @@ -90,16 +97,32 @@ def test_python_engine(self): 1,2,3,4,5 1,2,,, 1,2,3,4,""" - engines = 'python', 'python-fwf' - for engine in engines: - for default in py_unsupported: - msg = ('The %r option is not supported ' - 'with the %r engine' % (default, engine)) + for default in py_unsupported: + msg = ('The %r option is not supported ' + 'with the %r engine' % (default, python_engine)) + + kwargs = {default: object()} + with tm.assert_raises_regex(ValueError, msg): + read_csv(StringIO(data), engine=python_engine, **kwargs) - kwargs = {default: object()} - with tm.assert_raises_regex(ValueError, msg): - read_csv(StringIO(data), engine=engine, **kwargs) + def test_python_engine_file_no_next(self, python_engine): + # see gh-16530 + class NoNextBuffer(object): + def __init__(self, csv_data): + self.data = csv_data + + def __iter__(self): + return self + + def read(self): + return self.data + + data = "a\n1" + msg = "The 'python' engine cannot iterate" + + with tm.assert_raises_regex(ValueError, msg): + read_csv(NoNextBuffer(data), engine=python_engine) class TestDeprecatedFeatures(object): diff --git a/pandas/tests/io/parser/usecols.py b/pandas/tests/io/parser/usecols.py index 8761d1ccd3da4..f582e5037ca07 100644 --- a/pandas/tests/io/parser/usecols.py +++ b/pandas/tests/io/parser/usecols.py @@ -475,3 +475,54 @@ def test_uneven_length_cols(self): 'C': [3, 5, 4, 3, 3, 7]}) df = self.read_csv(StringIO(data), usecols=usecols) tm.assert_frame_equal(df, expected) + + def test_raise_on_usecols_names_mismatch(self): + # GH 14671 + data = 'a,b,c,d\n1,2,3,4\n5,6,7,8' + + if self.engine == 'c': + msg = 'Usecols do not match names' + else: + msg = 'is not in list' + + usecols = ['a', 'b', 'c', 'd'] + df = self.read_csv(StringIO(data), usecols=usecols) + expected = DataFrame({'a': [1, 5], 'b': [2, 6], 'c': [3, 7], + 'd': [4, 8]}) + tm.assert_frame_equal(df, expected) + + usecols = ['a', 'b', 'c', 'f'] + with tm.assert_raises_regex(ValueError, msg): + self.read_csv(StringIO(data), usecols=usecols) + + usecols = ['a', 'b', 'f'] + with tm.assert_raises_regex(ValueError, msg): + self.read_csv(StringIO(data), usecols=usecols) + + names = ['A', 'B', 'C', 'D'] + + df = self.read_csv(StringIO(data), header=0, names=names) + expected = DataFrame({'A': [1, 5], 'B': [2, 6], 'C': [3, 7], + 'D': [4, 8]}) + tm.assert_frame_equal(df, expected) + + # TODO: https://github.com/pandas-dev/pandas/issues/16469 + # usecols = ['A','C'] + # df = self.read_csv(StringIO(data), header=0, names=names, + # usecols=usecols) + # expected = DataFrame({'A': [1,5], 'C': [3,7]}) + # tm.assert_frame_equal(df, expected) + # + # usecols = [0,2] + # df = self.read_csv(StringIO(data), header=0, names=names, + # usecols=usecols) + # expected = DataFrame({'A': [1,5], 'C': [3,7]}) + # tm.assert_frame_equal(df, expected) + + usecols = ['A', 'B', 'C', 'f'] + with tm.assert_raises_regex(ValueError, msg): + self.read_csv(StringIO(data), header=0, names=names, + usecols=usecols) + usecols = ['A', 'B', 'f'] + with tm.assert_raises_regex(ValueError, msg): + self.read_csv(StringIO(data), names=names, usecols=usecols) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index b7d158dd75960..b527e3c5dc254 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -143,7 +143,6 @@ def test_read_fspath_all(self, reader, module, path): ('to_csv', {}, 'os'), ('to_excel', {'engine': 
'xlwt'}, 'xlwt'), ('to_feather', {}, 'feather'), - ('to_hdf', {'key': 'bar', 'mode': 'w'}, 'tables'), ('to_html', {}, 'os'), ('to_json', {}, 'os'), ('to_latex', {}, 'os'), @@ -171,6 +170,26 @@ def test_write_fspath_all(self, writer_name, writer_kwargs, module): assert result == expected + def test_write_fspath_hdf5(self): + # Same test as write_fspath_all, except HDF5 files aren't + # necessarily byte-for-byte identical for a given dataframe, so we'll + # have to read and compare equality + pytest.importorskip('tables') + + df = pd.DataFrame({"A": [1, 2]}) + p1 = tm.ensure_clean('string') + p2 = tm.ensure_clean('fspath') + + with p1 as string, p2 as fspath: + mypath = CustomFSPath(fspath) + df.to_hdf(mypath, key='bar') + df.to_hdf(string, key='bar') + + result = pd.read_hdf(fspath, key='bar') + expected = pd.read_hdf(string, key='bar') + + tm.assert_frame_equal(result, expected) + class TestMMapWrapper(object): @@ -223,3 +242,10 @@ def test_next(self): assert next_line.strip() == line.strip() pytest.raises(StopIteration, next, wrapper) + + def test_unknown_engine(self): + with tm.ensure_clean() as path: + df = tm.makeDataFrame() + df.to_csv(path) + with tm.assert_raises_regex(ValueError, 'Unknown engine'): + read_csv(path, engine='pyt') diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 4441ed815370b..abe3757ec64f3 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -881,8 +881,42 @@ def test_excel_multindex_roundtrip(self): tm.assert_frame_equal( df, act, check_names=check_names) - def test_excel_oldindex_format(self): - # GH 4679 + def test_excel_old_index_format(self): + # see gh-4679 + filename = 'test_index_name_pre17' + self.ext + in_file = os.path.join(self.dirpath, filename) + + # We detect headers to determine if index names exist, so + # that "index" name in the "names" version of the data will + # now be interpreted as rows that include null data. + data = np.array([[None, None, None, None, None], + ['R0C0', 'R0C1', 'R0C2', 'R0C3', 'R0C4'], + ['R1C0', 'R1C1', 'R1C2', 'R1C3', 'R1C4'], + ['R2C0', 'R2C1', 'R2C2', 'R2C3', 'R2C4'], + ['R3C0', 'R3C1', 'R3C2', 'R3C3', 'R3C4'], + ['R4C0', 'R4C1', 'R4C2', 'R4C3', 'R4C4']]) + columns = ['C_l0_g0', 'C_l0_g1', 'C_l0_g2', 'C_l0_g3', 'C_l0_g4'] + mi = MultiIndex(levels=[['R0', 'R_l0_g0', 'R_l0_g1', + 'R_l0_g2', 'R_l0_g3', 'R_l0_g4'], + ['R1', 'R_l1_g0', 'R_l1_g1', + 'R_l1_g2', 'R_l1_g3', 'R_l1_g4']], + labels=[[0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5]], + names=[None, None]) + si = Index(['R0', 'R_l0_g0', 'R_l0_g1', 'R_l0_g2', + 'R_l0_g3', 'R_l0_g4'], name=None) + + expected = pd.DataFrame(data, index=si, columns=columns) + + actual = pd.read_excel(in_file, 'single_names') + tm.assert_frame_equal(actual, expected) + + expected.index = mi + + actual = pd.read_excel(in_file, 'multi_names') + tm.assert_frame_equal(actual, expected) + + # The analogous versions of the "names" version data + # where there are explicitly no names for the indices. 
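+ # (same cell data as above, but read back with index names of None + # rather than 'R0'/'R1')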
data = np.array([['R0C0', 'R0C1', 'R0C2', 'R0C3', 'R0C4'], ['R1C0', 'R1C1', 'R1C2', 'R1C3', 'R1C4'], ['R2C0', 'R2C1', 'R2C2', 'R2C3', 'R2C4'], @@ -894,40 +928,19 @@ def test_excel_oldindex_format(self): ['R_l1_g0', 'R_l1_g1', 'R_l1_g2', 'R_l1_g3', 'R_l1_g4']], labels=[[0, 1, 2, 3, 4], [0, 1, 2, 3, 4]], - names=['R0', 'R1']) + names=[None, None]) si = Index(['R_l0_g0', 'R_l0_g1', 'R_l0_g2', - 'R_l0_g3', 'R_l0_g4'], name='R0') - - in_file = os.path.join( - self.dirpath, 'test_index_name_pre17' + self.ext) + 'R_l0_g3', 'R_l0_g4'], name=None) expected = pd.DataFrame(data, index=si, columns=columns) - with tm.assert_produces_warning(FutureWarning): - actual = pd.read_excel( - in_file, 'single_names', has_index_names=True) - tm.assert_frame_equal(actual, expected) - expected.index.name = None actual = pd.read_excel(in_file, 'single_no_names') tm.assert_frame_equal(actual, expected) - with tm.assert_produces_warning(FutureWarning): - actual = pd.read_excel( - in_file, 'single_no_names', has_index_names=False) - tm.assert_frame_equal(actual, expected) expected.index = mi - with tm.assert_produces_warning(FutureWarning): - actual = pd.read_excel( - in_file, 'multi_names', has_index_names=True) - tm.assert_frame_equal(actual, expected) - expected.index.names = [None, None] actual = pd.read_excel(in_file, 'multi_no_names', index_col=[0, 1]) tm.assert_frame_equal(actual, expected, check_names=False) - with tm.assert_produces_warning(FutureWarning): - actual = pd.read_excel(in_file, 'multi_no_names', index_col=[0, 1], - has_index_names=False) - tm.assert_frame_equal(actual, expected, check_names=False) def test_read_excel_bool_header_arg(self): # GH 6114 diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 6da77bf423609..1e1d653cf94d1 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -20,7 +20,7 @@ from pandas import (DataFrame, MultiIndex, read_csv, Timestamp, Index, date_range, Series) from pandas.compat import (map, zip, StringIO, string_types, BytesIO, - is_platform_windows) + is_platform_windows, PY3) from pandas.io.common import URLError, urlopen, file_path_to_url from pandas.io.html import read_html from pandas._libs.parsers import ParserError @@ -96,6 +96,9 @@ def read_html(self, *args, **kwargs): class TestReadHtml(ReadHtmlMixin): flavor = 'bs4' spam_data = os.path.join(DATA_PATH, 'spam.html') + spam_data_kwargs = {} + if PY3: + spam_data_kwargs['encoding'] = 'UTF-8' banklist_data = os.path.join(DATA_PATH, 'banklist.html') @classmethod @@ -247,10 +250,10 @@ def test_infer_types(self): assert_framelist_equal(df1, df2) def test_string_io(self): - with open(self.spam_data) as f: + with open(self.spam_data, **self.spam_data_kwargs) as f: data1 = StringIO(f.read()) - with open(self.spam_data) as f: + with open(self.spam_data, **self.spam_data_kwargs) as f: data2 = StringIO(f.read()) df1 = self.read_html(data1, '.*Water.*') @@ -258,7 +261,7 @@ def test_string_io(self): assert_framelist_equal(df1, df2) def test_string(self): - with open(self.spam_data) as f: + with open(self.spam_data, **self.spam_data_kwargs) as f: data = f.read() df1 = self.read_html(data, '.*Water.*') @@ -267,10 +270,10 @@ def test_string(self): assert_framelist_equal(df1, df2) def test_file_like(self): - with open(self.spam_data) as f: + with open(self.spam_data, **self.spam_data_kwargs) as f: df1 = self.read_html(f, '.*Water.*') - with open(self.spam_data) as f: + with open(self.spam_data, **self.spam_data_kwargs) as f: df2 = self.read_html(f, 'Unit') 
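The `spam_data_kwargs` pattern above exists because the builtin `open()` only accepts an `encoding` argument on Python 3. A sketch of the portable alternative, `io.open()`, which takes `encoding` on both major versions (the temporary file and markup here are illustrative, not the spam.html fixture):

```python
import io
import tempfile

# Write UTF-8 bytes, then reopen with an explicit encoding; io.open()
# behaves like Python 3's open() on Python 2 as well, so no
# version-conditional kwargs dict is needed.
with tempfile.NamedTemporaryFile(mode='wb', suffix='.html',
                                 delete=False) as tmp:
    tmp.write(u'<table><tr><td>Water</td></tr></table>'.encode('utf-8'))

with io.open(tmp.name, encoding='UTF-8') as f:
    assert u'Water' in f.read()
```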
assert_framelist_equal(df1, df2) diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index 17f524cc279c0..efec778e12b50 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -16,7 +16,7 @@ date_range, timedelta_range, Index, DatetimeIndex, isnull) -from pandas.compat import is_platform_windows, PY3, PY35, BytesIO +from pandas.compat import is_platform_windows, PY3, PY35, BytesIO, text_type from pandas.io.formats.printing import pprint_thing tables = pytest.importorskip('tables') @@ -387,6 +387,7 @@ def test_repr(self): with ensure_clean_store(self.path) as store: repr(store) + store.info() store['a'] = tm.makeTimeSeries() store['b'] = tm.makeStringSeries() store['c'] = tm.makeDataFrame() @@ -418,8 +419,9 @@ def test_repr(self): # make a random group in hdf space store._handle.create_group(store._handle.root, 'bah') - repr(store) - str(store) + assert store.filename in repr(store) + assert store.filename in str(store) + store.info() # storers with ensure_clean_store(self.path) as store: @@ -2920,6 +2922,27 @@ def test_store_index_name_with_tz(self): recons = store['frame'] tm.assert_frame_equal(recons, df) + @pytest.mark.parametrize('table_format', ['table', 'fixed']) + def test_store_index_name_numpy_str(self, table_format): + # GH #13492 + idx = pd.Index(pd.to_datetime([datetime.date(2000, 1, 1), + datetime.date(2000, 1, 2)]), + name=u('cols\u05d2')) + idx1 = pd.Index(pd.to_datetime([datetime.date(2010, 1, 1), + datetime.date(2010, 1, 2)]), + name=u('rows\u05d0')) + df = pd.DataFrame(np.arange(4).reshape(2, 2), columns=idx, index=idx1) + + # This used to fail, returning numpy strings instead of python strings. + with ensure_clean_path(self.path) as path: + df.to_hdf(path, 'df', format=table_format) + df2 = read_hdf(path, 'df') + + assert_frame_equal(df, df2, check_names=True) + + assert type(df2.index.name) == text_type + assert type(df2.columns.name) == text_type + def test_store_series_name(self): df = tm.makeDataFrame() series = df['A'] @@ -4219,6 +4242,21 @@ def test_start_stop_table(self): expected = df.loc[30:40, ['A']] tm.assert_frame_equal(result, expected) + def test_start_stop_multiple(self): + + # GH 16209 + with ensure_clean_store(self.path) as store: + + df = DataFrame({"foo": [1, 2], "bar": [1, 2]}) + + store.append_to_multiple({'selector': ['foo'], 'data': None}, df, + selector='selector') + result = store.select_as_multiple(['selector', 'data'], + selector='selector', start=0, + stop=1) + expected = df.loc[[0], ['foo', 'bar']] + tm.assert_frame_equal(result, expected) + def test_start_stop_fixed(self): with ensure_clean_store(self.path) as store: @@ -4371,11 +4409,11 @@ def test_multiple_open_close(self): # single store = HDFStore(path) - assert 'CLOSED' not in str(store) + assert 'CLOSED' not in store.info() assert store.is_open store.close() - assert 'CLOSED' in str(store) + assert 'CLOSED' in store.info() assert not store.is_open with ensure_clean_path(self.path) as path: @@ -4396,20 +4434,20 @@ def f(): store1 = HDFStore(path) store2 = HDFStore(path) - assert 'CLOSED' not in str(store1) - assert 'CLOSED' not in str(store2) + assert 'CLOSED' not in store1.info() + assert 'CLOSED' not in store2.info() assert store1.is_open assert store2.is_open store1.close() - assert 'CLOSED' in str(store1) + assert 'CLOSED' in store1.info() assert not store1.is_open - assert 'CLOSED' not in str(store2) + assert 'CLOSED' not in store2.info() assert store2.is_open store2.close() - assert 'CLOSED' in str(store1) - assert 
'CLOSED' in str(store2) + assert 'CLOSED' in store1.info() + assert 'CLOSED' in store2.info() assert not store1.is_open assert not store2.is_open @@ -4420,11 +4458,11 @@ def f(): store2 = HDFStore(path) store2.append('df2', df) store2.close() - assert 'CLOSED' in str(store2) + assert 'CLOSED' in store2.info() assert not store2.is_open store.close() - assert 'CLOSED' in str(store) + assert 'CLOSED' in store.info() assert not store.is_open # double closing @@ -4433,11 +4471,11 @@ def f(): store2 = HDFStore(path) store.close() - assert 'CLOSED' in str(store) + assert 'CLOSED' in store.info() assert not store.is_open store2.close() - assert 'CLOSED' in str(store2) + assert 'CLOSED' in store2.info() assert not store2.is_open # ops on a closed store @@ -4784,9 +4822,10 @@ def test_categorical(self): tm.assert_frame_equal(result, df2) # Make sure the metadata is OK - assert '/df2 ' in str(store) - assert '/df2/meta/values_block_0/meta' in str(store) - assert '/df2/meta/values_block_1/meta' in str(store) + info = store.info() + assert '/df2 ' in info + assert '/df2/meta/values_block_0/meta' in info + assert '/df2/meta/values_block_1/meta' in info # unordered s = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c'], categories=[ diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 7b3717281bf89..deeb8cba2b228 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -816,6 +816,16 @@ def test_unicode_column_name(self): df = DataFrame([[1, 2], [3, 4]], columns=[u'\xe9', u'b']) df.to_sql('test_unicode', self.conn, index=False) + def test_escaped_table_name(self): + # GH 13206 + df = DataFrame({'A': [0, 1, 2], 'B': [0.2, np.nan, 5.6]}) + df.to_sql('d1187b08-4943-4c8d-a7f6', self.conn, index=False) + + res = sql.read_sql_query('SELECT * FROM `d1187b08-4943-4c8d-a7f6`', + self.conn) + + tm.assert_frame_equal(res, df) + @pytest.mark.single class TestSQLApi(SQLAlchemyMixIn, _TestSQLApi): @@ -938,6 +948,13 @@ def test_database_uri_string(self): # using driver that will not be installed on Travis to trigger error # in sqlalchemy.create_engine -> test passing of this error to user + try: + # the rest of this test depends on pg8000's being absent + import pg8000 # noqa + pytest.skip("pg8000 is installed") + except ImportError: + pass + db_uri = "postgresql+pg8000://user:pass@host/dbname" with tm.assert_raises_regex(ImportError, "pg8000"): sql.read_sql("select * from table", db_uri) diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index 0e15aaa2555f4..0cff365be3ec8 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -55,16 +55,15 @@ def test_ts_plot_with_tz(self): def test_fontsize_set_correctly(self): # For issue #8765 - import matplotlib.pyplot as plt # noqa df = DataFrame(np.random.randn(10, 9), index=range(10)) - ax = df.plot(fontsize=2) + fig, ax = self.plt.subplots() + df.plot(fontsize=2, ax=ax) for label in (ax.get_xticklabels() + ax.get_yticklabels()): assert label.get_fontsize() == 2 @slow def test_frame_inferred(self): # inferred freq - import matplotlib.pyplot as plt # noqa idx = date_range('1/1/1987', freq='MS', periods=100) idx = DatetimeIndex(idx.values, freq=None) @@ -90,26 +89,24 @@ def test_is_error_nozeroindex(self): _check_plot_works(a.plot, yerr=a) def test_nonnumeric_exclude(self): - import matplotlib.pyplot as plt - idx = date_range('1/1/1987', freq='A', periods=3) df = DataFrame({'A': ["x", "y", "z"], 'B': [1, 2, 3]}, idx) - ax = df.plot() # 
it works + fig, ax = self.plt.subplots() + df.plot(ax=ax) # it works assert len(ax.get_lines()) == 1 # B was plotted - plt.close(plt.gcf()) + self.plt.close(fig) pytest.raises(TypeError, df['A'].plot) @slow def test_tsplot(self): from pandas.tseries.plotting import tsplot - import matplotlib.pyplot as plt - ax = plt.gca() + _, ax = self.plt.subplots() ts = tm.makeTimeSeries() - f = lambda *args, **kwds: tsplot(s, plt.Axes.plot, *args, **kwds) + f = lambda *args, **kwds: tsplot(s, self.plt.Axes.plot, *args, **kwds) for s in self.period_ser: _check_plot_works(f, s.index.freq, ax=ax, series=s) @@ -123,12 +120,12 @@ def test_tsplot(self): for s in self.datetime_ser: _check_plot_works(s.plot, ax=ax) - ax = ts.plot(style='k') + _, ax = self.plt.subplots() + ts.plot(style='k', ax=ax) color = (0., 0., 0., 1) if self.mpl_ge_2_0_0 else (0., 0., 0.) assert color == ax.get_lines()[0].get_color() def test_both_style_and_color(self): - import matplotlib.pyplot as plt # noqa ts = tm.makeTimeSeries() pytest.raises(ValueError, ts.plot, style='b-', color='#000099') @@ -140,9 +137,10 @@ def test_both_style_and_color(self): def test_high_freq(self): freaks = ['ms', 'us'] for freq in freaks: + _, ax = self.plt.subplots() rng = date_range('1/1/2012', periods=100000, freq=freq) ser = Series(np.random.randn(len(rng)), rng) - _check_plot_works(ser.plot) + _check_plot_works(ser.plot, ax=ax) def test_get_datevalue(self): from pandas.plotting._converter import get_datevalue @@ -167,22 +165,25 @@ def check_format_of_first_point(ax, expected_string): annual = Series(1, index=date_range('2014-01-01', periods=3, freq='A-DEC')) - check_format_of_first_point(annual.plot(), 't = 2014 y = 1.000000') + _, ax = self.plt.subplots() + annual.plot(ax=ax) + check_format_of_first_point(ax, 't = 2014 y = 1.000000') # note this is added to the annual plot already in existence, and # changes its freq field daily = Series(1, index=date_range('2014-01-01', periods=3, freq='D')) - check_format_of_first_point(daily.plot(), + daily.plot(ax=ax) + check_format_of_first_point(ax, 't = 2014-01-01 y = 1.000000') tm.close() # tsplot - import matplotlib.pyplot as plt + _, ax = self.plt.subplots() from pandas.tseries.plotting import tsplot - tsplot(annual, plt.Axes.plot) - check_format_of_first_point(plt.gca(), 't = 2014 y = 1.000000') - tsplot(daily, plt.Axes.plot) - check_format_of_first_point(plt.gca(), 't = 2014-01-01 y = 1.000000') + tsplot(annual, self.plt.Axes.plot, ax=ax) + check_format_of_first_point(ax, 't = 2014 y = 1.000000') + tsplot(daily, self.plt.Axes.plot, ax=ax) + check_format_of_first_point(ax, 't = 2014-01-01 y = 1.000000') @slow def test_line_plot_period_series(self): @@ -215,14 +216,11 @@ def test_line_plot_inferred_freq(self): _check_plot_works(ser.plot) def test_fake_inferred_business(self): - import matplotlib.pyplot as plt - fig = plt.gcf() - plt.clf() - fig.add_subplot(111) + _, ax = self.plt.subplots() rng = date_range('2001-1-1', '2001-1-10') ts = Series(lrange(len(rng)), rng) ts = ts[:3].append(ts[5:]) - ax = ts.plot() + ts.plot(ax=ax) assert not hasattr(ax, 'freq') @slow @@ -244,15 +242,11 @@ def test_plot_multiple_inferred_freq(self): @slow def test_uhf(self): import pandas.plotting._converter as conv - import matplotlib.pyplot as plt - fig = plt.gcf() - plt.clf() - fig.add_subplot(111) - idx = date_range('2012-6-22 21:59:51.960928', freq='L', periods=500) df = DataFrame(np.random.randn(len(idx), 2), idx) - ax = df.plot() + _, ax = self.plt.subplots() + df.plot(ax=ax) axis = ax.get_xaxis() tlocs = 
axis.get_ticklocs() @@ -265,49 +259,40 @@ def test_uhf(self): @slow def test_irreg_hf(self): - import matplotlib.pyplot as plt - fig = plt.gcf() - plt.clf() - fig.add_subplot(111) - idx = date_range('2012-6-22 21:59:51', freq='S', periods=100) df = DataFrame(np.random.randn(len(idx), 2), idx) irreg = df.iloc[[0, 1, 3, 4]] - ax = irreg.plot() + _, ax = self.plt.subplots() + irreg.plot(ax=ax) diffs = Series(ax.get_lines()[0].get_xydata()[:, 0]).diff() sec = 1. / 24 / 60 / 60 assert (np.fabs(diffs[1:] - [sec, sec * 2, sec]) < 1e-8).all() - plt.clf() - fig.add_subplot(111) + _, ax = self.plt.subplots() df2 = df.copy() df2.index = df.index.asobject - ax = df2.plot() + df2.plot(ax=ax) diffs = Series(ax.get_lines()[0].get_xydata()[:, 0]).diff() assert (np.fabs(diffs[1:] - sec) < 1e-8).all() def test_irregular_datetime64_repr_bug(self): - import matplotlib.pyplot as plt ser = tm.makeTimeSeries() ser = ser[[0, 1, 2, 7]] - fig = plt.gcf() - plt.clf() + _, ax = self.plt.subplots() - ax = fig.add_subplot(211) - - ret = ser.plot() + ret = ser.plot(ax=ax) assert ret is not None for rs, xp in zip(ax.get_lines()[0].get_xdata(), ser.index): assert rs == xp def test_business_freq(self): - import matplotlib.pyplot as plt # noqa bts = tm.makePeriodSeries() - ax = bts.plot() + _, ax = self.plt.subplots() + bts.plot(ax=ax) assert ax.get_lines()[0].get_xydata()[0, 0] == bts.index[0].ordinal idx = ax.get_lines()[0].get_xdata() assert PeriodIndex(data=idx).freqstr == 'B' @@ -319,7 +304,8 @@ def test_business_freq_convert(self): bts = tm.makeTimeSeries().asfreq('BM') tm.N = n ts = bts.to_period('M') - ax = bts.plot() + _, ax = self.plt.subplots() + bts.plot(ax=ax) assert ax.get_lines()[0].get_xydata()[0, 0] == ts.index[0].ordinal idx = ax.get_lines()[0].get_xdata() assert PeriodIndex(data=idx).freqstr == 'M' @@ -329,19 +315,20 @@ def test_nonzero_base(self): idx = (date_range('2012-12-20', periods=24, freq='H') + timedelta( minutes=30)) df = DataFrame(np.arange(24), index=idx) - ax = df.plot() + _, ax = self.plt.subplots() + df.plot(ax=ax) rs = ax.get_lines()[0].get_xdata() assert not Index(rs).is_normalized def test_dataframe(self): bts = DataFrame({'a': tm.makeTimeSeries()}) - ax = bts.plot() + _, ax = self.plt.subplots() + bts.plot(ax=ax) idx = ax.get_lines()[0].get_xdata() tm.assert_index_equal(bts.index.to_period(), PeriodIndex(idx)) @slow def test_axis_limits(self): - import matplotlib.pyplot as plt def _test(ax): xlim = ax.get_xlim() @@ -369,14 +356,16 @@ def _test(ax): assert int(result[0]) == expected[0].ordinal assert int(result[1]) == expected[1].ordinal fig = ax.get_figure() - plt.close(fig) + self.plt.close(fig) ser = tm.makeTimeSeries() - ax = ser.plot() + _, ax = self.plt.subplots() + ser.plot(ax=ax) _test(ax) + _, ax = self.plt.subplots() df = DataFrame({'a': ser, 'b': ser + 1}) - ax = df.plot() + df.plot(ax=ax) _test(ax) df = DataFrame({'a': ser, 'b': ser + 1}) @@ -397,13 +386,13 @@ def test_get_finder(self): @slow def test_finder_daily(self): - import matplotlib.pyplot as plt xp = Period('1999-1-1', freq='B').ordinal day_lst = [10, 40, 252, 400, 950, 2750, 10000] for n in day_lst: rng = bdate_range('1999-1-1', periods=n) ser = Series(np.random.randn(len(rng)), rng) - ax = ser.plot() + _, ax = self.plt.subplots() + ser.plot(ax=ax) xaxis = ax.get_xaxis() rs = xaxis.get_majorticklocs()[0] assert xp == rs @@ -411,17 +400,17 @@ def test_finder_daily(self): ax.set_xlim(vmin + 0.9, vmax) rs = xaxis.get_majorticklocs()[0] assert xp == rs - plt.close(ax.get_figure()) + self.plt.close(ax.get_figure()) @slow 
def test_finder_quarterly(self): - import matplotlib.pyplot as plt xp = Period('1988Q1').ordinal yrs = [3.5, 11] for n in yrs: rng = period_range('1987Q2', periods=int(n * 4), freq='Q') ser = Series(np.random.randn(len(rng)), rng) - ax = ser.plot() + _, ax = self.plt.subplots() + ser.plot(ax=ax) xaxis = ax.get_xaxis() rs = xaxis.get_majorticklocs()[0] assert rs == xp @@ -429,17 +418,17 @@ def test_finder_quarterly(self): ax.set_xlim(vmin + 0.9, vmax) rs = xaxis.get_majorticklocs()[0] assert xp == rs - plt.close(ax.get_figure()) + self.plt.close(ax.get_figure()) @slow def test_finder_monthly(self): - import matplotlib.pyplot as plt xp = Period('Jan 1988').ordinal yrs = [1.15, 2.5, 4, 11] for n in yrs: rng = period_range('1987Q2', periods=int(n * 12), freq='M') ser = Series(np.random.randn(len(rng)), rng) - ax = ser.plot() + _, ax = self.plt.subplots() + ser.plot(ax=ax) xaxis = ax.get_xaxis() rs = xaxis.get_majorticklocs()[0] assert rs == xp @@ -447,12 +436,13 @@ def test_finder_monthly(self): ax.set_xlim(vmin + 0.9, vmax) rs = xaxis.get_majorticklocs()[0] assert xp == rs - plt.close(ax.get_figure()) + self.plt.close(ax.get_figure()) def test_finder_monthly_long(self): rng = period_range('1988Q1', periods=24 * 12, freq='M') ser = Series(np.random.randn(len(rng)), rng) - ax = ser.plot() + _, ax = self.plt.subplots() + ser.plot(ax=ax) xaxis = ax.get_xaxis() rs = xaxis.get_majorticklocs()[0] xp = Period('1989Q1', 'M').ordinal @@ -460,23 +450,24 @@ def test_finder_monthly_long(self): @slow def test_finder_annual(self): - import matplotlib.pyplot as plt xp = [1987, 1988, 1990, 1990, 1995, 2020, 2070, 2170] for i, nyears in enumerate([5, 10, 19, 49, 99, 199, 599, 1001]): rng = period_range('1987', periods=nyears, freq='A') ser = Series(np.random.randn(len(rng)), rng) - ax = ser.plot() + _, ax = self.plt.subplots() + ser.plot(ax=ax) xaxis = ax.get_xaxis() rs = xaxis.get_majorticklocs()[0] assert rs == Period(xp[i], freq='A').ordinal - plt.close(ax.get_figure()) + self.plt.close(ax.get_figure()) @slow def test_finder_minutely(self): nminutes = 50 * 24 * 60 rng = date_range('1/1/1999', freq='Min', periods=nminutes) ser = Series(np.random.randn(len(rng)), rng) - ax = ser.plot() + _, ax = self.plt.subplots() + ser.plot(ax=ax) xaxis = ax.get_xaxis() rs = xaxis.get_majorticklocs()[0] xp = Period('1/1/1999', freq='Min').ordinal @@ -486,7 +477,8 @@ def test_finder_hourly(self): nhours = 23 rng = date_range('1/1/1999', freq='H', periods=nhours) ser = Series(np.random.randn(len(rng)), rng) - ax = ser.plot() + _, ax = self.plt.subplots() + ser.plot(ax=ax) xaxis = ax.get_xaxis() rs = xaxis.get_majorticklocs()[0] xp = Period('1/1/1999', freq='H').ordinal @@ -494,11 +486,10 @@ def test_finder_hourly(self): @slow def test_gaps(self): - import matplotlib.pyplot as plt - ts = tm.makeTimeSeries() ts[5:25] = np.nan - ax = ts.plot() + _, ax = self.plt.subplots() + ts.plot(ax=ax) lines = ax.get_lines() tm._skip_if_mpl_1_5() assert len(lines) == 1 @@ -507,13 +498,14 @@ def test_gaps(self): assert isinstance(data, np.ma.core.MaskedArray) mask = data.mask assert mask[5:25, 1].all() - plt.close(ax.get_figure()) + self.plt.close(ax.get_figure()) # irregular ts = tm.makeTimeSeries() ts = ts[[0, 1, 2, 5, 7, 9, 12, 15, 20]] ts[2:5] = np.nan - ax = ts.plot() + _, ax = self.plt.subplots() + ax = ts.plot(ax=ax) lines = ax.get_lines() assert len(lines) == 1 l = lines[0] @@ -521,13 +513,14 @@ def test_gaps(self): assert isinstance(data, np.ma.core.MaskedArray) mask = data.mask assert mask[2:5, 1].all() - plt.close(ax.get_figure()) 
+ self.plt.close(ax.get_figure()) # non-ts idx = [0, 1, 2, 5, 7, 9, 12, 15, 20] ser = Series(np.random.randn(len(idx)), idx) ser[2:5] = np.nan - ax = ser.plot() + _, ax = self.plt.subplots() + ser.plot(ax=ax) lines = ax.get_lines() assert len(lines) == 1 l = lines[0] @@ -540,7 +533,8 @@ def test_gaps(self): def test_gap_upsample(self): low = tm.makeTimeSeries() low[5:25] = np.nan - ax = low.plot() + _, ax = self.plt.subplots() + low.plot(ax=ax) idxh = date_range(low.index[0], low.index[-1], freq='12h') s = Series(np.random.randn(len(idxh)), idxh) @@ -559,26 +553,25 @@ def test_gap_upsample(self): @slow def test_secondary_y(self): - import matplotlib.pyplot as plt - ser = Series(np.random.randn(10)) ser2 = Series(np.random.randn(10)) + fig, _ = self.plt.subplots() ax = ser.plot(secondary_y=True) assert hasattr(ax, 'left_ax') assert not hasattr(ax, 'right_ax') - fig = ax.get_figure() axes = fig.get_axes() l = ax.get_lines()[0] xp = Series(l.get_ydata(), l.get_xdata()) assert_series_equal(ser, xp) assert ax.get_yaxis().get_ticks_position() == 'right' assert not axes[0].get_yaxis().get_visible() - plt.close(fig) + self.plt.close(fig) - ax2 = ser2.plot() + _, ax2 = self.plt.subplots() + ser2.plot(ax=ax2) assert (ax2.get_yaxis().get_ticks_position() == self.default_tick_position) - plt.close(ax2.get_figure()) + self.plt.close(ax2.get_figure()) ax = ser2.plot() ax2 = ser.plot(secondary_y=True) @@ -590,26 +583,26 @@ def test_secondary_y(self): @slow def test_secondary_y_ts(self): - import matplotlib.pyplot as plt idx = date_range('1/1/2000', periods=10) ser = Series(np.random.randn(10), idx) ser2 = Series(np.random.randn(10), idx) + fig, _ = self.plt.subplots() ax = ser.plot(secondary_y=True) assert hasattr(ax, 'left_ax') assert not hasattr(ax, 'right_ax') - fig = ax.get_figure() axes = fig.get_axes() l = ax.get_lines()[0] xp = Series(l.get_ydata(), l.get_xdata()).to_timestamp() assert_series_equal(ser, xp) assert ax.get_yaxis().get_ticks_position() == 'right' assert not axes[0].get_yaxis().get_visible() - plt.close(fig) + self.plt.close(fig) - ax2 = ser2.plot() + _, ax2 = self.plt.subplots() + ser2.plot(ax=ax2) assert (ax2.get_yaxis().get_ticks_position() == self.default_tick_position) - plt.close(ax2.get_figure()) + self.plt.close(ax2.get_figure()) ax = ser2.plot() ax2 = ser.plot(secondary_y=True) @@ -620,20 +613,19 @@ def test_secondary_kde(self): tm._skip_if_no_scipy() _skip_if_no_scipy_gaussian_kde() - import matplotlib.pyplot as plt # noqa ser = Series(np.random.randn(10)) - ax = ser.plot(secondary_y=True, kind='density') + fig, ax = self.plt.subplots() + ax = ser.plot(secondary_y=True, kind='density', ax=ax) assert hasattr(ax, 'left_ax') assert not hasattr(ax, 'right_ax') - fig = ax.get_figure() axes = fig.get_axes() assert axes[1].get_yaxis().get_ticks_position() == 'right' @slow def test_secondary_bar(self): ser = Series(np.random.randn(10)) - ax = ser.plot(secondary_y=True, kind='bar') - fig = ax.get_figure() + fig, ax = self.plt.subplots() + ser.plot(secondary_y=True, kind='bar', ax=ax) axes = fig.get_axes() assert axes[1].get_yaxis().get_ticks_position() == 'right' @@ -656,7 +648,7 @@ def test_secondary_bar_frame(self): assert axes[2].get_yaxis().get_ticks_position() == 'right' def test_mixed_freq_regular_first(self): - import matplotlib.pyplot as plt # noqa + # TODO s1 = tm.makeTimeSeries() s2 = s1[[0, 5, 10, 11, 12, 13, 14, 15]] @@ -676,11 +668,11 @@ def test_mixed_freq_regular_first(self): @slow def test_mixed_freq_irregular_first(self): - import matplotlib.pyplot as plt # noqa s1 = 
tm.makeTimeSeries() s2 = s1[[0, 5, 10, 11, 12, 13, 14, 15]] - s2.plot(style='g') - ax = s1.plot() + _, ax = self.plt.subplots() + s2.plot(style='g', ax=ax) + s1.plot(ax=ax) assert not hasattr(ax, 'freq') lines = ax.get_lines() x1 = lines[0].get_xdata() @@ -690,10 +682,10 @@ def test_mixed_freq_irregular_first(self): def test_mixed_freq_regular_first_df(self): # GH 9852 - import matplotlib.pyplot as plt # noqa s1 = tm.makeTimeSeries().to_frame() s2 = s1.iloc[[0, 5, 10, 11, 12, 13, 14, 15], :] - ax = s1.plot() + _, ax = self.plt.subplots() + s1.plot(ax=ax) ax2 = s2.plot(style='g', ax=ax) lines = ax2.get_lines() idx1 = PeriodIndex(lines[0].get_xdata()) @@ -708,11 +700,11 @@ def test_mixed_freq_regular_first_df(self): @slow def test_mixed_freq_irregular_first_df(self): # GH 9852 - import matplotlib.pyplot as plt # noqa s1 = tm.makeTimeSeries().to_frame() s2 = s1.iloc[[0, 5, 10, 11, 12, 13, 14, 15], :] - ax = s2.plot(style='g') - ax = s1.plot(ax=ax) + _, ax = self.plt.subplots() + s2.plot(style='g', ax=ax) + s1.plot(ax=ax) assert not hasattr(ax, 'freq') lines = ax.get_lines() x1 = lines[0].get_xdata() @@ -725,8 +717,9 @@ def test_mixed_freq_hf_first(self): idxl = date_range('1/1/1999', periods=12, freq='M') high = Series(np.random.randn(len(idxh)), idxh) low = Series(np.random.randn(len(idxl)), idxl) - high.plot() - ax = low.plot() + _, ax = self.plt.subplots() + high.plot(ax=ax) + low.plot(ax=ax) for l in ax.get_lines(): assert PeriodIndex(data=l.get_xdata()).freq == 'D' @@ -738,33 +731,35 @@ def test_mixed_freq_alignment(self): ts = Series(ts_data, index=ts_ind) ts2 = ts.asfreq('T').interpolate() - ax = ts.plot() - ts2.plot(style='r') + _, ax = self.plt.subplots() + ax = ts.plot(ax=ax) + ts2.plot(style='r', ax=ax) assert ax.lines[0].get_xdata()[0] == ax.lines[1].get_xdata()[0] @slow def test_mixed_freq_lf_first(self): - import matplotlib.pyplot as plt idxh = date_range('1/1/1999', periods=365, freq='D') idxl = date_range('1/1/1999', periods=12, freq='M') high = Series(np.random.randn(len(idxh)), idxh) low = Series(np.random.randn(len(idxl)), idxl) - low.plot(legend=True) - ax = high.plot(legend=True) + _, ax = self.plt.subplots() + low.plot(legend=True, ax=ax) + high.plot(legend=True, ax=ax) for l in ax.get_lines(): assert PeriodIndex(data=l.get_xdata()).freq == 'D' leg = ax.get_legend() assert len(leg.texts) == 2 - plt.close(ax.get_figure()) + self.plt.close(ax.get_figure()) idxh = date_range('1/1/1999', periods=240, freq='T') idxl = date_range('1/1/1999', periods=4, freq='H') high = Series(np.random.randn(len(idxh)), idxh) low = Series(np.random.randn(len(idxl)), idxl) - low.plot() - ax = high.plot() + _, ax = self.plt.subplots() + low.plot(ax=ax) + high.plot(ax=ax) for l in ax.get_lines(): assert PeriodIndex(data=l.get_xdata()).freq == 'T' @@ -773,8 +768,9 @@ def test_mixed_freq_irreg_period(self): irreg = ts[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 16, 17, 18, 29]] rng = period_range('1/3/2000', periods=30, freq='B') ps = Series(np.random.randn(len(rng)), rng) - irreg.plot() - ps.plot() + _, ax = self.plt.subplots() + irreg.plot(ax=ax) + ps.plot(ax=ax) def test_mixed_freq_shared_ax(self): @@ -813,9 +809,7 @@ def test_mixed_freq_shared_ax(self): def test_nat_handling(self): - fig = self.plt.gcf() - # self.plt.clf() - ax = fig.add_subplot(111) + _, ax = self.plt.subplots() dti = DatetimeIndex(['2015-01-01', NaT, '2015-01-03']) s = Series(range(len(dti)), dti) @@ -831,17 +825,18 @@ def test_to_weekly_resampling(self): idxl = date_range('1/1/1999', periods=12, freq='M') high = 
Series(np.random.randn(len(idxh)), idxh) low = Series(np.random.randn(len(idxl)), idxl) - high.plot() - ax = low.plot() + _, ax = self.plt.subplots() + high.plot(ax=ax) + low.plot(ax=ax) for l in ax.get_lines(): assert PeriodIndex(data=l.get_xdata()).freq == idxh.freq # tsplot from pandas.tseries.plotting import tsplot - import matplotlib.pyplot as plt - tsplot(high, plt.Axes.plot) - lines = tsplot(low, plt.Axes.plot) + _, ax = self.plt.subplots() + tsplot(high, self.plt.Axes.plot, ax=ax) + lines = tsplot(low, self.plt.Axes.plot, ax=ax) for l in lines: assert PeriodIndex(data=l.get_xdata()).freq == idxh.freq @@ -851,8 +846,9 @@ def test_from_weekly_resampling(self): idxl = date_range('1/1/1999', periods=12, freq='M') high = Series(np.random.randn(len(idxh)), idxh) low = Series(np.random.randn(len(idxl)), idxl) - low.plot() - ax = high.plot() + _, ax = self.plt.subplots() + low.plot(ax=ax) + high.plot(ax=ax) expected_h = idxh.to_period().asi8.astype(np.float64) expected_l = np.array([1514, 1519, 1523, 1527, 1531, 1536, 1540, 1544, @@ -868,10 +864,10 @@ def test_from_weekly_resampling(self): # tsplot from pandas.tseries.plotting import tsplot - import matplotlib.pyplot as plt - tsplot(low, plt.Axes.plot) - lines = tsplot(high, plt.Axes.plot) + _, ax = self.plt.subplots() + tsplot(low, self.plt.Axes.plot, ax=ax) + lines = tsplot(high, self.plt.Axes.plot, ax=ax) for l in lines: assert PeriodIndex(data=l.get_xdata()).freq == idxh.freq xdata = l.get_xdata(orig=False) @@ -891,8 +887,9 @@ def test_from_resampling_area_line_mixed(self): # low to high for kind1, kind2 in [('line', 'area'), ('area', 'line')]: - ax = low.plot(kind=kind1, stacked=True) - ax = high.plot(kind=kind2, stacked=True, ax=ax) + _, ax = self.plt.subplots() + low.plot(kind=kind1, stacked=True, ax=ax) + high.plot(kind=kind2, stacked=True, ax=ax) # check low dataframe result expected_x = np.array([1514, 1519, 1523, 1527, 1531, 1536, 1540, @@ -923,8 +920,9 @@ def test_from_resampling_area_line_mixed(self): # high to low for kind1, kind2 in [('line', 'area'), ('area', 'line')]: - ax = high.plot(kind=kind1, stacked=True) - ax = low.plot(kind=kind2, stacked=True, ax=ax) + _, ax = self.plt.subplots() + high.plot(kind=kind1, stacked=True, ax=ax) + low.plot(kind=kind2, stacked=True, ax=ax) # check high dataframe result expected_x = idxh.to_period().asi8.astype(np.float64) @@ -960,16 +958,18 @@ def test_mixed_freq_second_millisecond(self): high = Series(np.random.randn(len(idxh)), idxh) low = Series(np.random.randn(len(idxl)), idxl) # high to low - high.plot() - ax = low.plot() + _, ax = self.plt.subplots() + high.plot(ax=ax) + low.plot(ax=ax) assert len(ax.get_lines()) == 2 for l in ax.get_lines(): assert PeriodIndex(data=l.get_xdata()).freq == 'L' tm.close() # low to high - low.plot() - ax = high.plot() + _, ax = self.plt.subplots() + low.plot(ax=ax) + high.plot(ax=ax) assert len(ax.get_lines()) == 2 for l in ax.get_lines(): assert PeriodIndex(data=l.get_xdata()).freq == 'L' @@ -985,7 +985,8 @@ def test_irreg_dtypes(self): idx = date_range('1/1/2000', periods=10) idx = idx[[0, 2, 5, 9]].asobject df = DataFrame(np.random.randn(len(idx), 3), idx) - _check_plot_works(df.plot) + _, ax = self.plt.subplots() + _check_plot_works(df.plot, ax=ax) @slow def test_time(self): @@ -995,7 +996,8 @@ def test_time(self): df = DataFrame({'a': np.random.randn(len(ts)), 'b': np.random.randn(len(ts))}, index=ts) - ax = df.plot() + _, ax = self.plt.subplots() + df.plot(ax=ax) # verify tick labels ticks = ax.get_xticks() @@ -1031,7 +1033,8 @@ def 
test_time_musec(self): df = DataFrame({'a': np.random.randn(len(ts)), 'b': np.random.randn(len(ts))}, index=ts) - ax = df.plot() + _, ax = self.plt.subplots() + ax = df.plot(ax=ax) # verify tick labels ticks = ax.get_xticks() @@ -1054,8 +1057,9 @@ def test_secondary_upsample(self): idxl = date_range('1/1/1999', periods=12, freq='M') high = Series(np.random.randn(len(idxh)), idxh) low = Series(np.random.randn(len(idxl)), idxl) - low.plot() - ax = high.plot(secondary_y=True) + _, ax = self.plt.subplots() + low.plot(ax=ax) + ax = high.plot(secondary_y=True, ax=ax) for l in ax.get_lines(): assert PeriodIndex(l.get_xdata()).freq == 'D' assert hasattr(ax, 'left_ax') @@ -1065,14 +1069,12 @@ def test_secondary_upsample(self): @slow def test_secondary_legend(self): - import matplotlib.pyplot as plt - fig = plt.gcf() - plt.clf() + fig = self.plt.figure() ax = fig.add_subplot(211) # ts df = tm.makeTimeDataFrame() - ax = df.plot(secondary_y=['A', 'B']) + df.plot(secondary_y=['A', 'B'], ax=ax) leg = ax.get_legend() assert len(leg.get_lines()) == 4 assert leg.get_texts()[0].get_text() == 'A (right)' @@ -1086,33 +1088,37 @@ def test_secondary_legend(self): # TODO: color cycle problems assert len(colors) == 4 + self.plt.close(fig) - plt.clf() + fig = self.plt.figure() ax = fig.add_subplot(211) - ax = df.plot(secondary_y=['A', 'C'], mark_right=False) + df.plot(secondary_y=['A', 'C'], mark_right=False, ax=ax) leg = ax.get_legend() assert len(leg.get_lines()) == 4 assert leg.get_texts()[0].get_text() == 'A' assert leg.get_texts()[1].get_text() == 'B' assert leg.get_texts()[2].get_text() == 'C' assert leg.get_texts()[3].get_text() == 'D' + self.plt.close(fig) - plt.clf() - ax = df.plot(kind='bar', secondary_y=['A']) + fig, ax = self.plt.subplots() + df.plot(kind='bar', secondary_y=['A'], ax=ax) leg = ax.get_legend() assert leg.get_texts()[0].get_text() == 'A (right)' assert leg.get_texts()[1].get_text() == 'B' + self.plt.close(fig) - plt.clf() - ax = df.plot(kind='bar', secondary_y=['A'], mark_right=False) + fig, ax = self.plt.subplots() + df.plot(kind='bar', secondary_y=['A'], mark_right=False, ax=ax) leg = ax.get_legend() assert leg.get_texts()[0].get_text() == 'A' assert leg.get_texts()[1].get_text() == 'B' + self.plt.close(fig) - plt.clf() + fig = self.plt.figure() ax = fig.add_subplot(211) df = tm.makeTimeDataFrame() - ax = df.plot(secondary_y=['C', 'D']) + ax = df.plot(secondary_y=['C', 'D'], ax=ax) leg = ax.get_legend() assert len(leg.get_lines()) == 4 assert ax.right_ax.get_legend() is None @@ -1122,12 +1128,13 @@ def test_secondary_legend(self): # TODO: color cycle problems assert len(colors) == 4 + self.plt.close(fig) # non-ts df = tm.makeDataFrame() - plt.clf() + fig = self.plt.figure() ax = fig.add_subplot(211) - ax = df.plot(secondary_y=['A', 'B']) + ax = df.plot(secondary_y=['A', 'B'], ax=ax) leg = ax.get_legend() assert len(leg.get_lines()) == 4 assert ax.right_ax.get_legend() is None @@ -1137,10 +1144,11 @@ def test_secondary_legend(self): # TODO: color cycle problems assert len(colors) == 4 + self.plt.close() - plt.clf() + fig = self.plt.figure() ax = fig.add_subplot(211) - ax = df.plot(secondary_y=['C', 'D']) + ax = df.plot(secondary_y=['C', 'D'], ax=ax) leg = ax.get_legend() assert len(leg.get_lines()) == 4 assert ax.right_ax.get_legend() is None @@ -1154,7 +1162,8 @@ def test_secondary_legend(self): def test_format_date_axis(self): rng = date_range('1/1/2012', periods=12, freq='M') df = DataFrame(np.random.randn(len(rng), 3), rng) - ax = df.plot() + _, ax = self.plt.subplots() + ax = 
df.plot(ax=ax) xaxis = ax.get_xaxis() for l in xaxis.get_ticklabels(): if len(l.get_text()) > 0: @@ -1162,28 +1171,21 @@ def test_format_date_axis(self): @slow def test_ax_plot(self): - import matplotlib.pyplot as plt - x = DatetimeIndex(start='2012-01-02', periods=10, freq='D') y = lrange(len(x)) - fig = plt.figure() - ax = fig.add_subplot(111) + _, ax = self.plt.subplots() lines = ax.plot(x, y, label='Y') tm.assert_index_equal(DatetimeIndex(lines[0].get_xdata()), x) @slow def test_mpl_nopandas(self): - import matplotlib.pyplot as plt - dates = [date(2008, 12, 31), date(2009, 1, 31)] values1 = np.arange(10.0, 11.0, 0.5) values2 = np.arange(11.0, 12.0, 0.5) kw = dict(fmt='-', lw=4) - plt.close('all') - fig = plt.figure() - ax = fig.add_subplot(111) + _, ax = self.plt.subplots() ax.plot_date([x.toordinal() for x in dates], values1, **kw) ax.plot_date([x.toordinal() for x in dates], values2, **kw) @@ -1201,7 +1203,8 @@ def test_irregular_ts_shared_ax_xlim(self): ts_irregular = ts[[1, 4, 5, 6, 8, 9, 10, 12, 13, 14, 15, 17, 18]] # plot the left section of the irregular series, then the right section - ax = ts_irregular[:5].plot() + _, ax = self.plt.subplots() + ts_irregular[:5].plot(ax=ax) ts_irregular[5:].plot(ax=ax) # check that axis limits are correct @@ -1217,7 +1220,8 @@ def test_secondary_y_non_ts_xlim(self): s1 = Series(1, index=index_1) s2 = Series(2, index=index_2) - ax = s1.plot() + _, ax = self.plt.subplots() + s1.plot(ax=ax) left_before, right_before = ax.get_xlim() s2.plot(secondary_y=True, ax=ax) left_after, right_after = ax.get_xlim() @@ -1233,7 +1237,8 @@ def test_secondary_y_regular_ts_xlim(self): s1 = Series(1, index=index_1) s2 = Series(2, index=index_2) - ax = s1.plot() + _, ax = self.plt.subplots() + s1.plot(ax=ax) left_before, right_before = ax.get_xlim() s2.plot(secondary_y=True, ax=ax) left_after, right_after = ax.get_xlim() @@ -1247,7 +1252,8 @@ def test_secondary_y_mixed_freq_ts_xlim(self): rng = date_range('2000-01-01', periods=10000, freq='min') ts = Series(1, index=rng) - ax = ts.plot() + _, ax = self.plt.subplots() + ts.plot(ax=ax) left_before, right_before = ax.get_xlim() ts.resample('D').mean().plot(secondary_y=True, ax=ax) left_after, right_after = ax.get_xlim() @@ -1262,7 +1268,8 @@ def test_secondary_y_irregular_ts_xlim(self): ts = tm.makeTimeSeries()[:20] ts_irregular = ts[[1, 4, 5, 6, 8, 9, 10, 12, 13, 14, 15, 17, 18]] - ax = ts_irregular[:5].plot() + _, ax = self.plt.subplots() + ts_irregular[:5].plot(ax=ax) # plot higher-x values on secondary axis ts_irregular[5:].plot(secondary_y=True, ax=ax) # ensure secondary limits aren't overwritten by plot on primary @@ -1275,10 +1282,11 @@ def test_secondary_y_irregular_ts_xlim(self): def test_plot_outofbounds_datetime(self): # 2579 - checking this does not raise values = [date(1677, 1, 1), date(1677, 1, 2)] - self.plt.plot(values) + _, ax = self.plt.subplots() + ax.plot(values) values = [datetime(1677, 1, 1, 12), datetime(1677, 1, 2, 12)] - self.plt.plot(values) + ax.plot(values) def test_format_timedelta_ticks_narrow(self): if is_platform_mac(): @@ -1290,8 +1298,8 @@ def test_format_timedelta_ticks_narrow(self): rng = timedelta_range('0', periods=10, freq='ns') df = DataFrame(np.random.randn(len(rng), 3), rng) - ax = df.plot(fontsize=2) - fig = ax.get_figure() + fig, ax = self.plt.subplots() + df.plot(fontsize=2, ax=ax) fig.canvas.draw() labels = ax.get_xticklabels() assert len(labels) == len(expected_labels) @@ -1316,8 +1324,8 @@ def test_format_timedelta_ticks_wide(self): rng = timedelta_range('0', periods=10, 
freq='1 d') df = DataFrame(np.random.randn(len(rng), 3), rng) - ax = df.plot(fontsize=2) - fig = ax.get_figure() + fig, ax = self.plt.subplots() + ax = df.plot(fontsize=2, ax=ax) fig.canvas.draw() labels = ax.get_xticklabels() assert len(labels) == len(expected_labels) @@ -1327,19 +1335,22 @@ def test_format_timedelta_ticks_wide(self): def test_timedelta_plot(self): # test issue #8711 s = Series(range(5), timedelta_range('1day', periods=5)) - _check_plot_works(s.plot) + _, ax = self.plt.subplots() + _check_plot_works(s.plot, ax=ax) # test long period index = timedelta_range('1 day 2 hr 30 min 10 s', periods=10, freq='1 d') s = Series(np.random.randn(len(index)), index) - _check_plot_works(s.plot) + _, ax = self.plt.subplots() + _check_plot_works(s.plot, ax=ax) # test short period index = timedelta_range('1 day 2 hr 30 min 10 s', periods=10, freq='1 ns') s = Series(np.random.randn(len(index)), index) - _check_plot_works(s.plot) + _, ax = self.plt.subplots() + _check_plot_works(s.plot, ax=ax) def test_hist(self): # https://github.com/matplotlib/matplotlib/issues/8459 @@ -1347,7 +1358,8 @@ def test_hist(self): x = rng w1 = np.arange(0, 1, .1) w2 = np.arange(0, 1, .1)[::-1] - self.plt.hist([x, x], weights=[w1, w2]) + _, ax = self.plt.subplots() + ax.hist([x, x], weights=[w1, w2]) @slow def test_overlapping_datetime(self): @@ -1361,7 +1373,8 @@ def test_overlapping_datetime(self): # plot first series, then add the second series to those axes, # then try adding the first series again - ax = s1.plot() + _, ax = self.plt.subplots() + s1.plot(ax=ax) s2.plot(ax=ax) s1.plot(ax=ax) diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 340a98484480f..7c66b5dafb9c7 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -82,7 +82,8 @@ def test_plot(self): @slow def test_plot_figsize_and_title(self): # figsize and title - ax = self.series.plot(title='Test', figsize=(16, 8)) + _, ax = self.plt.subplots() + ax = self.series.plot(title='Test', figsize=(16, 8), ax=ax) self._check_text_labels(ax.title, 'Test') self._check_axes_shape(ax, axes_num=1, layout=(1, 1), figsize=(16, 8)) @@ -93,25 +94,28 @@ def test_dont_modify_rcParams(self): else: key = 'axes.color_cycle' colors = self.plt.rcParams[key] - Series([1, 2, 3]).plot() + _, ax = self.plt.subplots() + Series([1, 2, 3]).plot(ax=ax) assert colors == self.plt.rcParams[key] def test_ts_line_lim(self): - ax = self.ts.plot() + fig, ax = self.plt.subplots() + ax = self.ts.plot(ax=ax) xmin, xmax = ax.get_xlim() lines = ax.get_lines() assert xmin == lines[0].get_data(orig=False)[0][0] assert xmax == lines[0].get_data(orig=False)[0][-1] tm.close() - ax = self.ts.plot(secondary_y=True) + ax = self.ts.plot(secondary_y=True, ax=ax) xmin, xmax = ax.get_xlim() lines = ax.get_lines() assert xmin == lines[0].get_data(orig=False)[0][0] assert xmax == lines[0].get_data(orig=False)[0][-1] def test_ts_area_lim(self): - ax = self.ts.plot.area(stacked=False) + _, ax = self.plt.subplots() + ax = self.ts.plot.area(stacked=False, ax=ax) xmin, xmax = ax.get_xlim() line = ax.get_lines()[0].get_data(orig=False)[0] assert xmin == line[0] @@ -119,7 +123,8 @@ def test_ts_area_lim(self): tm.close() # GH 7471 - ax = self.ts.plot.area(stacked=False, x_compat=True) + _, ax = self.plt.subplots() + ax = self.ts.plot.area(stacked=False, x_compat=True, ax=ax) xmin, xmax = ax.get_xlim() line = ax.get_lines()[0].get_data(orig=False)[0] assert xmin == line[0] @@ -128,14 +133,16 @@ def test_ts_area_lim(self): 
tz_ts = self.ts.copy() tz_ts.index = tz_ts.tz_localize('GMT').tz_convert('CET') - ax = tz_ts.plot.area(stacked=False, x_compat=True) + _, ax = self.plt.subplots() + ax = tz_ts.plot.area(stacked=False, x_compat=True, ax=ax) xmin, xmax = ax.get_xlim() line = ax.get_lines()[0].get_data(orig=False)[0] assert xmin == line[0] assert xmax == line[-1] tm.close() - ax = tz_ts.plot.area(stacked=False, secondary_y=True) + _, ax = self.plt.subplots() + ax = tz_ts.plot.area(stacked=False, secondary_y=True, ax=ax) xmin, xmax = ax.get_xlim() line = ax.get_lines()[0].get_data(orig=False)[0] assert xmin == line[0] @@ -143,23 +150,28 @@ def test_label(self): s = Series([1, 2]) - ax = s.plot(label='LABEL', legend=True) + _, ax = self.plt.subplots() + ax = s.plot(label='LABEL', legend=True, ax=ax) self._check_legend_labels(ax, labels=['LABEL']) self.plt.close() - ax = s.plot(legend=True) + _, ax = self.plt.subplots() + ax = s.plot(legend=True, ax=ax) self._check_legend_labels(ax, labels=['None']) self.plt.close() # get name from index s.name = 'NAME' - ax = s.plot(legend=True) + _, ax = self.plt.subplots() + ax = s.plot(legend=True, ax=ax) self._check_legend_labels(ax, labels=['NAME']) self.plt.close() # override the default - ax = s.plot(legend=True, label='LABEL') + _, ax = self.plt.subplots() + ax = s.plot(legend=True, label='LABEL', ax=ax) self._check_legend_labels(ax, labels=['LABEL']) self.plt.close() # Add label info, but don't draw - ax = s.plot(legend=False, label='LABEL') + _, ax = self.plt.subplots() + ax = s.plot(legend=False, label='LABEL', ax=ax) assert ax.get_legend() is None # Hasn't been drawn ax.legend() # draw it self._check_legend_labels(ax, labels=['LABEL']) @@ -189,10 +201,12 @@ def test_line_area_nan_series(self): def test_line_use_index_false(self): s = Series([1, 2, 3], index=['a', 'b', 'c']) s.index.name = 'The Index' - ax = s.plot(use_index=False) + _, ax = self.plt.subplots() + ax = s.plot(use_index=False, ax=ax) label = ax.get_xlabel() assert label == '' - ax2 = s.plot.bar(use_index=False) + _, ax = self.plt.subplots() + ax2 = s.plot.bar(use_index=False, ax=ax) label2 = ax2.get_xlabel() assert label2 == '' @@ -203,11 +217,13 @@ def test_bar_log(self): if not self.mpl_le_1_2_1: expected = np.hstack((.1, expected, 1e4)) - ax = Series([200, 500]).plot.bar(log=True) + _, ax = self.plt.subplots() + ax = Series([200, 500]).plot.bar(log=True, ax=ax) tm.assert_numpy_array_equal(ax.yaxis.get_ticklocs(), expected) tm.close() - ax = Series([200, 500]).plot.barh(log=True) + _, ax = self.plt.subplots() + ax = Series([200, 500]).plot.barh(log=True, ax=ax) tm.assert_numpy_array_equal(ax.xaxis.get_ticklocs(), expected) tm.close() @@ -219,7 +235,8 @@ def test_bar_log(self): if self.mpl_ge_2_0_0: expected = np.hstack((1.0e-05, expected)) - ax = Series([0.1, 0.01, 0.001]).plot(log=True, kind='bar') + _, ax = self.plt.subplots() + ax = Series([0.1, 0.01, 0.001]).plot(log=True, kind='bar', ax=ax) ymin = 0.0007943282347242822 if self.mpl_ge_2_0_0 else 0.001 ymax = 0.12589254117941673 if self.mpl_ge_2_0_0 else .10000000000000001 res = ax.get_ylim() @@ -228,7 +245,8 @@ def test_bar_log(self): tm.assert_numpy_array_equal(ax.yaxis.get_ticklocs(), expected) tm.close() - ax = Series([0.1, 0.01, 0.001]).plot(log=True, kind='barh') + _, ax = self.plt.subplots() + ax = Series([0.1, 0.01, 0.001]).plot(log=True, kind='barh', ax=ax) res = ax.get_xlim() tm.assert_almost_equal(res[0], ymin) tm.assert_almost_equal(res[1], ymax) @@ -237,23 +255,27 @@ @slow def 
test_bar_ignore_index(self): df = Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']) - ax = df.plot.bar(use_index=False) + _, ax = self.plt.subplots() + ax = df.plot.bar(use_index=False, ax=ax) self._check_text_labels(ax.get_xticklabels(), ['0', '1', '2', '3']) def test_rotation(self): df = DataFrame(randn(5, 5)) # Default rot 0 - axes = df.plot() + _, ax = self.plt.subplots() + axes = df.plot(ax=ax) self._check_ticks_props(axes, xrot=0) - axes = df.plot(rot=30) + _, ax = self.plt.subplots() + axes = df.plot(rot=30, ax=ax) self._check_ticks_props(axes, xrot=30) def test_irregular_datetime(self): rng = date_range('1/1/2000', '3/1/2000') rng = rng[[0, 1, 2, 3, 5, 9, 10, 11, 12]] ser = Series(randn(len(rng)), rng) - ax = ser.plot() + _, ax = self.plt.subplots() + ax = ser.plot(ax=ax) xp = datetime(1999, 1, 1).toordinal() ax.set_xlim('1/1/1999', '1/1/2001') assert xp == ax.get_xlim()[0] @@ -311,7 +333,8 @@ def test_pie_series(self): def test_pie_nan(self): s = Series([1, np.nan, 1, 1]) - ax = s.plot.pie(legend=True) + _, ax = self.plt.subplots() + ax = s.plot.pie(legend=True, ax=ax) expected = ['0', '', '2', '3'] result = [x.get_text() for x in ax.texts] assert result == expected @@ -319,7 +342,8 @@ def test_pie_nan(self): @slow def test_hist_df_kwargs(self): df = DataFrame(np.random.randn(10, 2)) - ax = df.plot.hist(bins=5) + _, ax = self.plt.subplots() + ax = df.plot.hist(bins=5, ax=ax) assert len(ax.patches) == 10 @slow @@ -329,10 +353,12 @@ def test_hist_df_with_nonnumerics(self): df = DataFrame( np.random.randn(10, 4), columns=['A', 'B', 'C', 'D']) df['E'] = ['x', 'y'] * 5 - ax = df.plot.hist(bins=5) + _, ax = self.plt.subplots() + ax = df.plot.hist(bins=5, ax=ax) assert len(ax.patches) == 20 - ax = df.plot.hist() # bins=10 + _, ax = self.plt.subplots() + ax = df.plot.hist(ax=ax) # bins=10 assert len(ax.patches) == 40 @slow @@ -439,7 +465,8 @@ def test_hist_secondary_legend(self): df = DataFrame(np.random.randn(30, 4), columns=list('abcd')) # primary -> secondary - ax = df['a'].plot.hist(legend=True) + _, ax = self.plt.subplots() + ax = df['a'].plot.hist(legend=True, ax=ax) df['b'].plot.hist(ax=ax, legend=True, secondary_y=True) # both legends are drawn on left ax # left and right axis must be visible @@ -449,7 +476,8 @@ def test_hist_secondary_legend(self): tm.close() # secondary -> secondary - ax = df['a'].plot.hist(legend=True, secondary_y=True) + _, ax = self.plt.subplots() + ax = df['a'].plot.hist(legend=True, secondary_y=True, ax=ax) df['b'].plot.hist(ax=ax, legend=True, secondary_y=True) # both legends are drawn on left ax # left axis must be invisible, right axis must be visible @@ -460,7 +488,8 @@ def test_hist_secondary_legend(self): tm.close() # secondary -> primary - ax = df['a'].plot.hist(legend=True, secondary_y=True) + _, ax = self.plt.subplots() + ax = df['a'].plot.hist(legend=True, secondary_y=True, ax=ax) # right axes is returned df['b'].plot.hist(ax=ax, legend=True) # both legends are drawn on left ax @@ -477,8 +506,9 @@ def test_df_series_secondary_legend(self): s = Series(np.random.randn(30), name='x') # primary -> secondary (without passing ax) - ax = df.plot() - s.plot(legend=True, secondary_y=True) + _, ax = self.plt.subplots() + ax = df.plot(ax=ax) + s.plot(legend=True, secondary_y=True, ax=ax) # both legends are drawn on left ax # left and right axis must be visible self._check_legend_labels(ax, labels=['a', 'b', 'c', 'x (right)']) @@ -487,7 +517,8 @@ def test_df_series_secondary_legend(self): tm.close() # primary -> secondary (with passing ax) - ax = df.plot() + _, ax 
= self.plt.subplots() + ax = df.plot(ax=ax) s.plot(ax=ax, legend=True, secondary_y=True) # both legends are drawn on left ax # left and right axis must be visible @@ -497,8 +528,9 @@ def test_df_series_secondary_legend(self): tm.close() # secondary -> secondary (without passing ax) - ax = df.plot(secondary_y=True) - s.plot(legend=True, secondary_y=True) + _, ax = self.plt.subplots() + ax = df.plot(secondary_y=True, ax=ax) + s.plot(legend=True, secondary_y=True, ax=ax) # both legends are drawn on left ax # left axis must be invisible and right axis must be visible expected = ['a (right)', 'b (right)', 'c (right)', 'x (right)'] @@ -508,7 +540,8 @@ def test_df_series_secondary_legend(self): tm.close() # secondary -> secondary (with passing ax) - ax = df.plot(secondary_y=True) + _, ax = self.plt.subplots() + ax = df.plot(secondary_y=True, ax=ax) s.plot(ax=ax, legend=True, secondary_y=True) # both legends are drawn on left ax # left axis must be invisible and right axis must be visible @@ -519,7 +552,8 @@ def test_df_series_secondary_legend(self): tm.close() # secondary -> secondary (with passing ax) - ax = df.plot(secondary_y=True, mark_right=False) + _, ax = self.plt.subplots() + ax = df.plot(secondary_y=True, mark_right=False, ax=ax) s.plot(ax=ax, legend=True, secondary_y=True) # both legends are drawn on left ax # left axis must be invisible and right axis must be visible @@ -533,11 +567,13 @@ def test_df_series_secondary_legend(self): def test_plot_fails_with_dupe_color_and_style(self): x = Series(randn(2)) with pytest.raises(ValueError): - x.plot(style='k--', color='k') + _, ax = self.plt.subplots() + x.plot(style='k--', color='k', ax=ax) @slow def test_hist_kde(self): - ax = self.ts.plot.hist(logy=True) + _, ax = self.plt.subplots() + ax = self.ts.plot.hist(logy=True, ax=ax) self._check_ax_scales(ax, yaxis='log') xlabels = ax.get_xticklabels() # ticks are values, thus ticklabels are blank self._check_text_labels(xlabels, [''] * len(xlabels)) @@ -549,7 +585,8 @@ def test_hist_kde(self): _skip_if_no_scipy_gaussian_kde() _check_plot_works(self.ts.plot.kde) _check_plot_works(self.ts.plot.density) - ax = self.ts.plot.kde(logy=True) + _, ax = self.plt.subplots() + ax = self.ts.plot.kde(logy=True, ax=ax) self._check_ax_scales(ax, yaxis='log') xlabels = ax.get_xticklabels() self._check_text_labels(xlabels, [''] * len(xlabels)) @@ -565,8 +602,9 @@ def test_kde_kwargs(self): ind=linspace(-100, 100, 20)) _check_plot_works(self.ts.plot.density, bw_method=.5, ind=linspace(-100, 100, 20)) + _, ax = self.plt.subplots() ax = self.ts.plot.kde(logy=True, bw_method=.5, - ind=linspace(-100, 100, 20)) + ind=linspace(-100, 100, 20), ax=ax) self._check_ax_scales(ax, yaxis='log') self._check_text_labels(ax.yaxis.get_label(), 'Density') @@ -583,29 +621,34 @@ def test_kde_missing_vals(self): @slow def test_hist_kwargs(self): - ax = self.ts.plot.hist(bins=5) + _, ax = self.plt.subplots() + ax = self.ts.plot.hist(bins=5, ax=ax) assert len(ax.patches) == 5 self._check_text_labels(ax.yaxis.get_label(), 'Frequency') tm.close() if self.mpl_ge_1_3_1: - ax = self.ts.plot.hist(orientation='horizontal') + _, ax = self.plt.subplots() + ax = self.ts.plot.hist(orientation='horizontal', ax=ax) self._check_text_labels(ax.xaxis.get_label(), 'Frequency') tm.close() - ax = self.ts.plot.hist(align='left', stacked=True) + _, ax = self.plt.subplots() + ax = self.ts.plot.hist(align='left', stacked=True, ax=ax) tm.close() @slow def test_hist_kde_color(self): - ax = self.ts.plot.hist(logy=True, bins=10, color='b') + _, ax = self.plt.subplots() + ax = self.ts.plot.hist(logy=True, bins=10, 
color='b', ax=ax) self._check_ax_scales(ax, yaxis='log') assert len(ax.patches) == 10 self._check_colors(ax.patches, facecolors=['b'] * 10) tm._skip_if_no_scipy() _skip_if_no_scipy_gaussian_kde() - ax = self.ts.plot.kde(logy=True, color='r') + _, ax = self.plt.subplots() + ax = self.ts.plot.kde(logy=True, color='r', ax=ax) self._check_ax_scales(ax, yaxis='log') lines = ax.get_lines() assert len(lines) == 1 @@ -613,7 +656,8 @@ def test_hist_kde_color(self): @slow def test_boxplot_series(self): - ax = self.ts.plot.box(logy=True) + _, ax = self.plt.subplots() + ax = self.ts.plot.box(logy=True, ax=ax) self._check_ax_scales(ax, yaxis='log') xlabels = ax.get_xticklabels() self._check_text_labels(xlabels, [self.ts.name]) @@ -625,20 +669,22 @@ def test_kind_both_ways(self): s = Series(range(3)) kinds = (plotting._core._common_kinds + plotting._core._series_kinds) + _, ax = self.plt.subplots() for kind in kinds: if not _ok_for_gaussian_kde(kind): continue - s.plot(kind=kind) + s.plot(kind=kind, ax=ax) getattr(s.plot, kind)() @slow def test_invalid_plot_data(self): s = Series(list('abcd')) + _, ax = self.plt.subplots() for kind in plotting._core._common_kinds: if not _ok_for_gaussian_kde(kind): continue with pytest.raises(TypeError): - s.plot(kind=kind) + s.plot(kind=kind, ax=ax) @slow def test_valid_object_plot(self): @@ -650,11 +696,12 @@ def test_valid_object_plot(self): def test_partially_invalid_plot_data(self): s = Series(['a', 'b', 1.0, 2]) + _, ax = self.plt.subplots() for kind in plotting._core._common_kinds: if not _ok_for_gaussian_kde(kind): continue with pytest.raises(TypeError): - s.plot(kind=kind) + s.plot(kind=kind, ax=ax) def test_invalid_kind(self): s = Series([1, 2]) @@ -776,13 +823,15 @@ def test_standard_colors_all(self): def test_series_plot_color_kwargs(self): # GH1890 - ax = Series(np.arange(12) + 1).plot(color='green') + _, ax = self.plt.subplots() + ax = Series(np.arange(12) + 1).plot(color='green', ax=ax) self._check_colors(ax.get_lines(), linecolors=['green']) def test_time_series_plot_color_kwargs(self): # #1890 + _, ax = self.plt.subplots() ax = Series(np.arange(12) + 1, index=date_range( - '1/1/2000', periods=12)).plot(color='green') + '1/1/2000', periods=12)).plot(color='green', ax=ax) self._check_colors(ax.get_lines(), linecolors=['green']) def test_time_series_plot_color_with_empty_kwargs(self): @@ -797,14 +846,16 @@ def test_time_series_plot_color_with_empty_kwargs(self): ncolors = 3 + _, ax = self.plt.subplots() for i in range(ncolors): - ax = s.plot() + ax = s.plot(ax=ax) self._check_colors(ax.get_lines(), linecolors=def_colors[:ncolors]) def test_xticklabels(self): # GH11529 s = Series(np.arange(10), index=['P%02d' % i for i in range(10)]) - ax = s.plot(xticks=[0, 3, 5, 9]) + _, ax = self.plt.subplots() + ax = s.plot(xticks=[0, 3, 5, 9], ax=ax) exp = ['P%02d' % i for i in [0, 3, 5, 9]] self._check_text_labels(ax.get_xticklabels(), exp) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 270a93e4ae382..fc5a2eb468d4f 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1071,6 +1071,43 @@ def test_crosstab_margins(self): exp_rows = exp_rows.fillna(0).astype(np.int64) tm.assert_series_equal(all_rows, exp_rows) + def test_crosstab_margins_set_margin_name(self): + # GH 15972 + a = np.random.randint(0, 7, size=100) + b = np.random.randint(0, 3, size=100) + c = np.random.randint(0, 5, size=100) + + df = DataFrame({'a': a, 'b': b, 'c': c}) + + result = crosstab(a, [b, c], rownames=['a'], 
colnames=('b', 'c'), + margins=True, margins_name='TOTAL') + + assert result.index.names == ('a',) + assert result.columns.names == ['b', 'c'] + + all_cols = result['TOTAL', ''] + exp_cols = df.groupby(['a']).size().astype('i8') + # to keep index.name + exp_margin = Series([len(df)], index=Index(['TOTAL'], name='a')) + exp_cols = exp_cols.append(exp_margin) + exp_cols.name = ('TOTAL', '') + + tm.assert_series_equal(all_cols, exp_cols) + + all_rows = result.loc['TOTAL'] + exp_rows = df.groupby(['b', 'c']).size().astype('i8') + exp_rows = exp_rows.append(Series([len(df)], index=[('TOTAL', '')])) + exp_rows.name = 'TOTAL' + + exp_rows = exp_rows.reindex(all_rows.index) + exp_rows = exp_rows.fillna(0).astype(np.int64) + tm.assert_series_equal(all_rows, exp_rows) + + for margins_name in [666, None, ['a', 'b']]: + with pytest.raises(ValueError): + crosstab(a, [b, c], rownames=['a'], colnames=('b', 'c'), + margins=True, margins_name=margins_name) + def test_crosstab_pass_values(self): a = np.random.randint(0, 7, size=100) b = np.random.randint(0, 3, size=100) diff --git a/pandas/tests/reshape/test_tile.py b/pandas/tests/reshape/test_tile.py index 8602b33856fea..2523f8ab9f776 100644 --- a/pandas/tests/reshape/test_tile.py +++ b/pandas/tests/reshape/test_tile.py @@ -211,6 +211,7 @@ def test_cut_pass_labels(self): result = cut(arr, bins, labels=labels) exp = Categorical(['Medium'] + 4 * ['Small'] + ['Medium', 'Large'], + categories=labels, ordered=True) tm.assert_categorical_equal(result, exp) @@ -219,6 +220,13 @@ def test_cut_pass_labels(self): exp = Categorical.from_codes([1] + 4 * [0] + [1, 2], labels) tm.assert_categorical_equal(result, exp) + # issue 16459 + labels = ['Good', 'Medium', 'Bad'] + result = cut(arr, 3, labels=labels) + exp = cut(arr, 3, labels=Categorical(labels, categories=labels, + ordered=True)) + tm.assert_categorical_equal(result, exp) + def test_qcut_include_lowest(self): values = np.arange(10) diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py index c273d3161fff5..2c5f0d7772cc2 100644 --- a/pandas/tests/series/test_apply.py +++ b/pandas/tests/series/test_apply.py @@ -306,6 +306,22 @@ def test_reduce(self): name=self.series.name) assert_series_equal(result, expected) + def test_non_callable_aggregates(self): + # test agg using non-callable series attributes + s = Series([1, 2, None]) + + # Calling agg w/ just a string arg same as calling s.arg + result = s.agg('size') + expected = s.size + assert result == expected + + # test when mixed w/ callable reducers + result = s.agg(['size', 'count', 'mean']) + expected = Series(OrderedDict({'size': 3.0, + 'count': 2.0, + 'mean': 1.5})) + assert_series_equal(result[expected.index], expected) + class TestSeriesMap(TestData): diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index e084fa58d6c51..9ab02a8c2aad7 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -248,3 +248,12 @@ def test_intercept_astype_object(self): result = df.values.squeeze() assert (result[:, 0] == expected.values).all() + + def test_series_to_categorical(self): + # see gh-16524: test conversion of Series to Categorical + series = Series(['a', 'b', 'c']) + + result = Series(series, dtype='category') + expected = Series(['a', 'b', 'c'], dtype='category') + + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_indexing.py b/pandas/tests/series/test_indexing.py index 6ded4d593a571..7774d10c5eaf8 100644 --- 
diff --git a/pandas/tests/series/test_indexing.py b/pandas/tests/series/test_indexing.py
index 6ded4d593a571..7774d10c5eaf8 100644
--- a/pandas/tests/series/test_indexing.py
+++ b/pandas/tests/series/test_indexing.py
@@ -70,6 +70,21 @@ def test_get(self):
         result = vc.get(True, default='Missing')
         assert result == 'Missing'

+    def test_get_nan(self):
+        # GH 8569
+        s = pd.Float64Index(range(10)).to_series()
+        assert s.get(np.nan) is None
+        assert s.get(np.nan, default='Missing') == 'Missing'
+
+        # ensure that fixing the above hasn't broken get
+        # with multiple elements
+        idx = [20, 30]
+        assert_series_equal(s.get(idx),
+                            Series([np.nan] * 2, index=idx))
+        idx = [np.nan, np.nan]
+        assert_series_equal(s.get(idx),
+                            Series([np.nan] * 2, index=idx))
+
     def test_delitem(self):
         # GH 5542
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index 351e646cbb0b2..063dcea5c76d6 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -929,6 +929,22 @@ def test_unique_index(self):
             tm.assert_numpy_array_equal(case.duplicated(),
                                         np.array([False, False, False]))

+    @pytest.mark.parametrize('arr, unique', [
+        ([(0, 0), (0, 1), (1, 0), (1, 1), (0, 0), (0, 1), (1, 0), (1, 1)],
+         [(0, 0), (0, 1), (1, 0), (1, 1)]),
+        ([('b', 'c'), ('a', 'b'), ('a', 'b'), ('b', 'c')],
+         [('b', 'c'), ('a', 'b')]),
+        ([('a', 1), ('b', 2), ('a', 3), ('a', 1)],
+         [('a', 1), ('b', 2), ('a', 3)]),
+    ])
+    def test_unique_tuples(self, arr, unique):
+        # https://github.com/pandas-dev/pandas/issues/16519
+        expected = np.empty(len(unique), dtype=object)
+        expected[:] = unique
+
+        result = pd.unique(arr)
+        tm.assert_numpy_array_equal(result, expected)
+

 class GroupVarTestMixin(object):
diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py
index fae7bfa513dcd..08c3a25e66b0e 100644
--- a/pandas/tests/test_expressions.py
+++ b/pandas/tests/test_expressions.py
@@ -13,7 +13,7 @@
 from pandas.core.api import DataFrame, Panel
 from pandas.core.computation import expressions as expr
-from pandas import compat, _np_version_under1p11
+from pandas import compat, _np_version_under1p11, _np_version_under1p13
 from pandas.util.testing import (assert_almost_equal, assert_series_equal,
                                  assert_frame_equal, assert_panel_equal,
                                  assert_panel4d_equal, slow)
@@ -420,6 +420,10 @@ def test_bool_ops_warn_on_arithmetic(self):
             f = getattr(operator, name)
             fe = getattr(operator, sub_funcs[subs[op]])

+            # >= 1.13.0 these are now TypeErrors
+            if op == '-' and not _np_version_under1p13:
+                continue
+
             with tm.use_numexpr(True, min_elements=5):
                 with tm.assert_produces_warning(check_stacklevel=False):
                     r = f(df, df)
diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py
index 3243b69a25acd..e19e42e062932 100644
--- a/pandas/tests/test_panel.py
+++ b/pandas/tests/test_panel.py
@@ -2429,6 +2429,11 @@ def test_all_any_unhandled(self):
         pytest.raises(NotImplementedError, self.panel.all, bool_only=True)
         pytest.raises(NotImplementedError, self.panel.any, bool_only=True)

+    # GH issue 15960
+    def test_sort_values(self):
+        pytest.raises(NotImplementedError, self.panel.sort_values)
+        pytest.raises(NotImplementedError, self.panel.sort_values, 'ItemA')
+

 class TestLongPanel(object):
     """
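The `test_unique_tuples` cases pin down that `pd.unique` keeps tuple elements intact rather than exploding them into a 2-D array; roughly, assuming a build with the GH 16519 fix:

```
import numpy as np
import pandas as pd

pairs = [('b', 'c'), ('a', 'b'), ('a', 'b'), ('b', 'c')]
result = pd.unique(pairs)

# A 1-D object array of tuples, in order of first appearance --
# not a 2-D array of the tuples' contents.
assert result.dtype == object
assert result.tolist() == [('b', 'c'), ('a', 'b')]
```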
diff --git a/pandas/tests/test_panel4d.py b/pandas/tests/test_panel4d.py
index 96f02d63712fc..e1995316e7b7c 100644
--- a/pandas/tests/test_panel4d.py
+++ b/pandas/tests/test_panel4d.py
@@ -939,3 +939,8 @@ def test_rename(self):

     def test_get_attr(self):
         tm.assert_panel_equal(self.panel4d['l1'], self.panel4d.l1)
+
+    # GH issue 15960
+    def test_sort_values(self):
+        pytest.raises(NotImplementedError, self.panel4d.sort_values)
+        pytest.raises(NotImplementedError, self.panel4d.sort_values, 'ItemA')
diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py
index 170cab4947a5a..959e3d2f459ce 100644
--- a/pandas/tests/test_resample.py
+++ b/pandas/tests/test_resample.py
@@ -1672,6 +1672,28 @@ def test_resample_dtype_preservation(self):
         result = df.groupby('group').resample('1D').ffill()
         assert result.val.dtype == np.int32

+    def test_resample_dtype_coercion(self):
+
+        pytest.importorskip('scipy')
+
+        # GH 16361
+        df = {"a": [1, 3, 1, 4]}
+        df = pd.DataFrame(
+            df, index=pd.date_range("2017-01-01", "2017-01-04"))
+
+        expected = (df.astype("float64")
+                    .resample("H")
+                    .mean()
+                    ["a"]
+                    .interpolate("cubic")
+                    )
+
+        result = df.resample("H")["a"].mean().interpolate("cubic")
+        tm.assert_series_equal(result, expected)
+
+        result = df.resample("H").mean()["a"].interpolate("cubic")
+        tm.assert_series_equal(result, expected)
+
     def test_weekly_resample_buglet(self):
         # #1327
         rng = date_range('1/1/2000', freq='B', periods=20)
diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py
index 6a640d62108b3..cbb3c345a9353 100644
--- a/pandas/tests/test_window.py
+++ b/pandas/tests/test_window.py
@@ -3833,3 +3833,26 @@ def test_non_monotonic(self):
         df2 = df.sort_values('B')
         result = df2.groupby('A').rolling('4s', on='B').C.mean()
         tm.assert_series_equal(result, expected)
+
+    def test_rolling_cov_offset(self):
+        # GH16058
+
+        idx = pd.date_range('2017-01-01', periods=24, freq='1h')
+        ss = pd.Series(np.arange(len(idx)), index=idx)
+
+        result = ss.rolling('2h').cov()
+        expected = pd.Series([np.nan] + [0.5 for _ in range(len(idx) - 1)],
+                             index=idx)
+        tm.assert_series_equal(result, expected)
+
+        expected2 = ss.rolling(2, min_periods=1).cov()
+        tm.assert_series_equal(result, expected2)
+
+        result = ss.rolling('3h').cov()
+        expected = pd.Series([np.nan, 0.5] +
+                             [1.0 for _ in range(len(idx) - 2)],
+                             index=idx)
+        tm.assert_series_equal(result, expected)
+
+        expected2 = ss.rolling(3, min_periods=1).cov()
+        tm.assert_series_equal(result, expected2)
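`test_rolling_cov_offset` leans on the equivalence between offset-based and count-based windows on an evenly spaced index, which can be sketched standalone as:

```
import numpy as np
import pandas as pd
import pandas.util.testing as tm

idx = pd.date_range('2017-01-01', periods=24, freq='1h')
ss = pd.Series(np.arange(len(idx)), index=idx)

# On an hourly index, a '2h' window holds at most two points, so it
# matches a fixed window of 2 observations with min_periods=1.
by_offset = ss.rolling('2h').cov()
by_count = ss.rolling(2, min_periods=1).cov()
tm.assert_series_equal(by_offset, by_count)
```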
diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py
index f9f4adc1b2c81..2a120a0696836 100644
--- a/pandas/tseries/offsets.py
+++ b/pandas/tseries/offsets.py
@@ -1596,7 +1596,6 @@ def apply(self, other):
                 if otherDay != self.weekday:
                     other = other + timedelta((self.weekday - otherDay) % 7)
                     k = k - 1
-                other = other
                 for i in range(k):
                     other = other + self._inc
             else:
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
index f987045c27d5f..17e09b38b20e0 100644
--- a/pandas/util/testing.py
+++ b/pandas/util/testing.py
@@ -2424,15 +2424,8 @@ def assert_raises_regex(_exception, _regexp, _callable=None,
     Check that the specified Exception is raised and that the error message
     matches a given regular expression pattern. This may be a regular
     expression object or a string containing a regular expression suitable
-    for use by `re.search()`.
-
-    This is a port of the `assertRaisesRegexp` function from unittest in
-    Python 2.7. However, with our migration to `pytest`, please refrain
-    from using this. Instead, use the following paradigm:
-
-    with pytest.raises(_exception) as exc_info:
-        func(*args, **kwargs)
-    exc_info.matches(reg_exp)
+    for use by `re.search()`. This is a port of the `assertRaisesRegexp`
+    function from unittest in Python 2.7.

     Examples
     --------
diff --git a/setup.py b/setup.py
index 82d5f407228a9..31a3cddc3f9fd 100755
--- a/setup.py
+++ b/setup.py
@@ -702,6 +702,8 @@ def pxd(name):
                                      'parser/data/*.gz',
                                      'parser/data/*.bz2',
                                      'parser/data/*.txt',
+                                     'parser/data/*.tar',
+                                     'parser/data/*.tar.gz',
                                      'sas/data/*.csv',
                                      'sas/data/*.xpt',
                                      'sas/data/*.sas7bdat',
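The trimmed docstring above keeps `assert_raises_regex` documented as a plain port of `assertRaisesRegexp`; a rough usage sketch of both calling styles, assuming the 0.20-era `pandas.util.testing` API:

```
import pandas.util.testing as tm

# As a context manager: checks the exception type and that the
# message matches the regular expression via re.search().
with tm.assert_raises_regex(ValueError, 'invalid literal'):
    int('not a number')

# Or wrapping a callable directly:
tm.assert_raises_regex(TypeError, 'unsupported operand',
                       lambda: 1 + 'a')
```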